1745 files changed, 38393 insertions, 25985 deletions
diff --git a/Documentation/ABI/README b/Documentation/ABI/README index 8bac9cb09a6d..ef0e6d11e919 100644 --- a/Documentation/ABI/README +++ b/Documentation/ABI/README @@ -1,4 +1,5 @@ -This directory attempts to document the ABI between the Linux kernel and +This part of the documentation inside Documentation/ABI directory +attempts to document the ABI between the Linux kernel and userspace, and the relative stability of these interfaces. Due to the everchanging nature of Linux, and the differing maturity levels, these interfaces should be used by userspace programs in different ways. diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill index f25174eafd55..20cb688af173 100644 --- a/Documentation/ABI/removed/sysfs-class-rfkill +++ b/Documentation/ABI/removed/sysfs-class-rfkill @@ -4,7 +4,7 @@ For details to this subsystem look at Documentation/driver-api/rfkill.rst. What: /sys/class/rfkill/rfkill[0-9]+/claim Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: This file was deprecated because there no longer was a way to claim just control over a single rfkill instance. diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill index 037979f7dc4b..67b605e3dd16 100644 --- a/Documentation/ABI/stable/sysfs-class-rfkill +++ b/Documentation/ABI/stable/sysfs-class-rfkill @@ -16,7 +16,7 @@ Description: The rfkill class subsystem folder. What: /sys/class/rfkill/rfkill[0-9]+/name Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Name assigned by driver to this key (interface or driver name). Values: arbitrary string. @@ -24,7 +24,7 @@ Values: arbitrary string. What: /sys/class/rfkill/rfkill[0-9]+/type Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Driver type string ("wlan", "bluetooth", etc). Values: See include/linux/rfkill.h. @@ -32,7 +32,7 @@ Values: See include/linux/rfkill.h. What: /sys/class/rfkill/rfkill[0-9]+/persistent Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Whether the soft blocked state is initialised from non-volatile storage at startup. @@ -44,7 +44,7 @@ Values: A numeric value: What: /sys/class/rfkill/rfkill[0-9]+/state Date: 09-Jul-2007 -KernelVersion v2.6.22 +KernelVersion: v2.6.22 Contact: linux-wireless@vger.kernel.org Description: Current state of the transmitter. This file was scheduled to be removed in 2014, but due to its @@ -67,7 +67,7 @@ Values: A numeric value. What: /sys/class/rfkill/rfkill[0-9]+/hard Date: 12-March-2010 -KernelVersion v2.6.34 +KernelVersion: v2.6.34 Contact: linux-wireless@vger.kernel.org Description: Current hardblock state. This file is read only. Values: A numeric value. @@ -81,7 +81,7 @@ Values: A numeric value. What: /sys/class/rfkill/rfkill[0-9]+/soft Date: 12-March-2010 -KernelVersion v2.6.34 +KernelVersion: v2.6.34 Contact: linux-wireless@vger.kernel.org Description: Current softblock state. This file is read and write. Values: A numeric value. 
diff --git a/Documentation/ABI/stable/sysfs-devices-system-cpu b/Documentation/ABI/stable/sysfs-devices-system-cpu index 902392d7eddf..cf78bd99f6c8 100644 --- a/Documentation/ABI/stable/sysfs-devices-system-cpu +++ b/Documentation/ABI/stable/sysfs-devices-system-cpu @@ -24,12 +24,6 @@ Description: Default value for the Data Stream Control Register (DSCR) on If set by a process it will be inherited by child processes. Values: 64 bit unsigned integer (bit field) -What: /sys/devices/system/cpu/cpuX/topology/physical_package_id -Description: physical package id of cpuX. Typically corresponds to a physical - socket number, but the actual value is architecture and platform - dependent. -Values: integer - What: /sys/devices/system/cpu/cpuX/topology/die_id Description: the CPU die ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is @@ -86,10 +80,6 @@ What: /sys/devices/system/cpu/cpuX/topology/die_cpus Description: internal kernel map of CPUs within the same die. Values: hexadecimal bitmask. -What: /sys/devices/system/cpu/cpuX/topology/ppin -Description: per-socket protected processor inventory number -Values: hexadecimal. - What: /sys/devices/system/cpu/cpuX/topology/die_cpus_list Description: human-readable list of CPUs within the same die. The format is like 0-3, 8-11, 14,17. diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index f2ec42949a54..4a355e6747ae 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -246,14 +246,14 @@ Description: Controls whether PRS disable is turned on for the workqueue. capability. What: /sys/bus/dsa/devices/wq<m>.<n>/occupancy -Date May 25, 2021 +Date: May 25, 2021 KernelVersion: 5.14.0 Contact: dmaengine@vger.kernel.org Description: Show the current number of entries in this WQ if WQ Occupancy Support bit WQ capabilities is 1. What: /sys/bus/dsa/devices/wq<m>.<n>/enqcmds_retries -Date Oct 29, 2021 +Date: Oct 29, 2021 KernelVersion: 5.17.0 Contact: dmaengine@vger.kernel.org Description: Indicate the number of retires for an enqcmds submission on a sharedwq. diff --git a/Documentation/ABI/testing/configfs-usb-gadget-midi2 b/Documentation/ABI/testing/configfs-usb-gadget-midi2 index 0eac3aaba137..d76a52e2ca7f 100644 --- a/Documentation/ABI/testing/configfs-usb-gadget-midi2 +++ b/Documentation/ABI/testing/configfs-usb-gadget-midi2 @@ -47,7 +47,7 @@ Description: midi1_first_group The first UMP Group number for MIDI 1.0 (0-15) midi1_num_groups The number of groups for MIDI 1.0 (0-16) ui_hint 0: unknown, 1: receiver, 2: sender, 3: both - midi_ci_verison Supported MIDI-CI version number (8 bit) + midi_ci_version Supported MIDI-CI version number (8 bit) is_midi1 Legacy MIDI 1.0 device (0, 1 or 2) sysex8_streams Max number of SysEx8 streams (8 bit) active Active FB flag (0 or 1) diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti index bf2869c413e7..a97b70f588da 100644 --- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti @@ -1,241 +1,241 @@ What: /sys/bus/coresight/devices/<cti-name>/enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable/Disable the CTI hardware. 
What: /sys/bus/coresight/devices/<cti-name>/powered Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Indicate if the CTI hardware is powered. What: /sys/bus/coresight/devices/<cti-name>/ctmid Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Display the associated CTM ID What: /sys/bus/coresight/devices/<cti-name>/nr_trigger_cons Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Number of devices connected to triggers on this CTI What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/name Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Name of connected device <N> What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_signals Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Input trigger signals from connected device <N> What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/in_types Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Functional types for the input trigger signals from connected device <N> What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_signals Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Output trigger signals to connected device <N> What: /sys/bus/coresight/devices/<cti-name>/triggers<N>/out_types Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Functional types for the output trigger signals to connected device <N> What: /sys/bus/coresight/devices/<cti-name>/regs/inout_sel Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Select the index for inen and outen registers. What: /sys/bus/coresight/devices/<cti-name>/regs/inen Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write the CTIINEN register selected by inout_sel. What: /sys/bus/coresight/devices/<cti-name>/regs/outen Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write the CTIOUTEN register selected by inout_sel. What: /sys/bus/coresight/devices/<cti-name>/regs/gate Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write CTIGATE register. What: /sys/bus/coresight/devices/<cti-name>/regs/asicctl Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Read or write ASICCTL register. What: /sys/bus/coresight/devices/<cti-name>/regs/intack Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write the INTACK register. What: /sys/bus/coresight/devices/<cti-name>/regs/appset Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Set CTIAPPSET register to activate channel. Read back to determine current value of register. 
What: /sys/bus/coresight/devices/<cti-name>/regs/appclear Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write APPCLEAR register to deactivate channel. What: /sys/bus/coresight/devices/<cti-name>/regs/apppulse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Write APPPULSE to pulse a channel active for one clock cycle. What: /sys/bus/coresight/devices/<cti-name>/regs/chinstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read current status of channel inputs. What: /sys/bus/coresight/devices/<cti-name>/regs/choutstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of channel outputs. What: /sys/bus/coresight/devices/<cti-name>/regs/triginstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of input trigger signals What: /sys/bus/coresight/devices/<cti-name>/regs/trigoutstatus Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) read current status of output trigger signals. What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_attach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Attach a CTI input trigger to a CTM channel. What: /sys/bus/coresight/devices/<cti-name>/channels/trigin_detach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Detach a CTI input trigger from a CTM channel. What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_attach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Attach a CTI output trigger to a CTM channel. What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_detach Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Detach a CTI output trigger from a CTM channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable CTIGATE for single channel (Write) or list enabled channels through the gate (R). What: /sys/bus/coresight/devices/<cti-name>/channels/chan_gate_disable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Disable CTIGATE for single channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_set Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Activate a single channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_clear Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Deactivate a single channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_pulse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Pulse a single channel - activate for a single clock cycle. 
What: /sys/bus/coresight/devices/<cti-name>/channels/trigout_filtered Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) List of output triggers filtered across all connections. What: /sys/bus/coresight/devices/<cti-name>/channels/trig_filter_enable Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Enable or disable trigger output signal filtering. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_inuse Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) show channels with at least one attached trigger signal. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_free Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) show channels with no attached trigger signals. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_sel Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (RW) Write channel number to select a channel to view, read to see selected channel number. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_in Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read to see input triggers connected to selected view channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_out Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Read) Read to see output triggers connected to selected view channel. What: /sys/bus/coresight/devices/<cti-name>/channels/chan_xtrigs_reset Date: March 2020 -KernelVersion 5.7 +KernelVersion: 5.7 Contact: Mike Leach or Mathieu Poirier Description: (Write) Clear all channel / trigger programming. diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm index bf710ea6e0ef..53cb454b60d0 100644 --- a/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-tpdm @@ -1,6 +1,6 @@ What: /sys/bus/coresight/devices/<tpdm-name>/integration_test Date: January 2023 -KernelVersion 6.2 +KernelVersion: 6.2 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Run integration test for tpdm. Integration test @@ -14,7 +14,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/reset_dataset Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Reset the dataset of the tpdm. @@ -24,7 +24,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_trig_type Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the trigger type of the DSB for tpdm. @@ -35,7 +35,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_trig_ts Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the trigger timestamp of the DSB for tpdm. 
@@ -46,7 +46,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_mode Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the programming mode of the DSB for tpdm. @@ -60,7 +60,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_edge/ctrl_idx Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the index number of the edge detection for the DSB @@ -69,7 +69,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_edge/ctrl_val Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: Write a data to control the edge detection corresponding to @@ -85,7 +85,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_edge/ctrl_mask Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: Write a data to mask the edge detection corresponding to the index @@ -97,21 +97,21 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_edge/edcr[0:15] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: Read a set of the edge control value of the DSB in TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/dsb_edge/edcmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: Read a set of the edge control mask of the DSB in TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/dsb_trig_patt/xpr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the value of the trigger pattern for the DSB @@ -119,7 +119,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_trig_patt/xpmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the mask of the trigger pattern for the DSB @@ -127,21 +127,21 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the value of the pattern for the DSB subunit TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpmr[0:7] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the mask of the pattern for the DSB subunit TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/enable_ts Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Set the pattern timestamp of DSB tpdm. 
Read @@ -153,7 +153,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/set_type Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Set the pattern type of DSB tpdm. Read @@ -165,7 +165,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_msr/msr[0:31] Date: March 2023 -KernelVersion 6.7 +KernelVersion: 6.7 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the MSR(mux select register) for the DSB subunit @@ -173,7 +173,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/cmb_mode Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Set the data collection mode of CMB tpdm. Continuous change creates CMB data set elements on every CMBCLK edge. @@ -187,7 +187,7 @@ Description: (Write) Set the data collection mode of CMB tpdm. Continuous What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_patt/xpr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the value of the trigger pattern for the CMB @@ -195,7 +195,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_patt/xpmr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the mask of the trigger pattern for the CMB @@ -203,21 +203,21 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the value of the pattern for the CMB subunit TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/dsb_patt/tpmr[0:1] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the mask of the pattern for the CMB subunit TPDM. What: /sys/bus/coresight/devices/<tpdm-name>/cmb_patt/enable_ts Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (Write) Set the pattern timestamp of CMB tpdm. Read @@ -229,7 +229,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/cmb_trig_ts Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the trigger timestamp of the CMB for tpdm. @@ -240,7 +240,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/cmb_ts_all Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Read or write the status of timestamp upon all interface. 
@@ -252,7 +252,7 @@ Description: What: /sys/bus/coresight/devices/<tpdm-name>/cmb_msr/msr[0:31] Date: January 2024 -KernelVersion 6.9 +KernelVersion: 6.9 Contact: Jinlong Mao (QUIC) <quic_jinlmao@quicinc.com>, Tao Zhang (QUIC) <quic_taozha@quicinc.com> Description: (RW) Set/Get the MSR(mux select register) for the CMB subunit diff --git a/Documentation/ABI/testing/sysfs-edac-ecs b/Documentation/ABI/testing/sysfs-edac-ecs new file mode 100644 index 000000000000..87c885c4eb1a --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-ecs @@ -0,0 +1,74 @@ +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/ecs_fruX subdirectory + pertains to the memory media ECS (Error Check Scrub) control + feature, where <dev-name> directory corresponds to a device + registered with the EDAC device driver for the ECS feature. + /ecs_fruX belongs to the media FRUs (Field Replaceable Unit) + under the memory device. + + The sysfs ECS attr nodes are only present if the parent + driver has implemented the corresponding attr callback + function and provided the necessary operations to the EDAC + device driver during registration. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/log_entry_type +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The log entry type of how the DDR5 ECS log is reported. + + - 0 - per DRAM. + + - 1 - per memory media FRU. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/mode +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The mode of how the DDR5 ECS counts the errors. + Error count is tracked based on two different modes + selected by DDR5 ECS Control Feature - Codeword mode and + Row Count mode. If the ECS is under Codeword mode, then + the error count increments each time a codeword with check + bit errors is detected. If the ECS is under Row Count mode, + then the error counter increments each time a row with + check bit errors is detected. + + - 0 - ECS counts rows in the memory media that have ECC errors. + + - 1 - ECS counts codewords with errors, specifically, it counts + the number of ECC-detected errors in the memory media. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/reset +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (WO) ECS reset ECC counter. + + - 1 - reset ECC counter to the default value. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/threshold +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) DDR5 ECS threshold count per gigabits of memory cells. + The ECS error count is subject to the ECS Threshold count + per Gbit, which masks error counts less than the Threshold. + + Supported values are 256, 1024 and 4096. + + All other values are reserved. 
diff --git a/Documentation/ABI/testing/sysfs-edac-memory-repair b/Documentation/ABI/testing/sysfs-edac-memory-repair new file mode 100644 index 000000000000..0434a3b23ff3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-memory-repair @@ -0,0 +1,206 @@ +What: /sys/bus/edac/devices/<dev-name>/mem_repairX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/mem_repairX subdirectory + pertains to the memory media repair features control, such as + PPR (Post Package Repair), memory sparing etc, where <dev-name> + directory corresponds to a device registered with the EDAC + device driver for the memory repair features. + + Post Package Repair is a maintenance operation requests the memory + device to perform a repair operation on its media. It is a memory + self-healing feature that fixes a failing memory location by + replacing it with a spare row in a DRAM device. For example, a + CXL memory device with DRAM components that support PPR features may + implement PPR maintenance operations. DRAM components may support + two types of PPR functions: hard PPR, for a permanent row repair, and + soft PPR, for a temporary row repair. Soft PPR may be much faster + than hard PPR, but the repair is lost with a power cycle. + + The sysfs attributes nodes for a repair feature are only + present if the parent driver has implemented the corresponding + attr callback function and provided the necessary operations + to the EDAC device driver during registration. + + In some states of system configuration (e.g. before address + decoders have been configured), memory devices (e.g. CXL) + may not have an active mapping in the main host address + physical address map. As such, the memory to repair must be + identified by a device specific physical addressing scheme + using a device physical address(DPA). The DPA and other control + attributes to use will be presented in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair_type +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Memory repair type. For eg. post package repair, + memory sparing etc. Valid values are: + + - ppr - Post package repair. + + - cacheline-sparing + + - row-sparing + + - bank-sparing + + - rank-sparing + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/persist_mode +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Get/Set the current persist repair mode set for a + repair function. Persist repair modes supported in the + device, based on a memory repair function, either is temporary, + which is lost with a power cycle or permanent. Valid values are: + + - 0 - Soft memory repair (temporary repair). + + - 1 - Hard memory repair (permanent repair). + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair_safe_when_in_use +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) True if memory media is accessible and data is retained + during the memory repair operation. + The data may not be retained and memory requests may not be + correctly processed during a repair operation. In such case + repair operation can not be executed at runtime. The memory + must be taken offline. 
+ +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/hpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Host Physical Address (HPA) of the memory to repair. + The HPA to use will be provided in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/dpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Device Physical Address (DPA) of the memory to repair. + The specific DPA to use will be provided in related error + records. + + In some states of system configuration (e.g. before address + decoders have been configured), memory devices (e.g. CXL) + may not have an active mapping in the main host address + physical address map. As such, the memory to repair must be + identified by a device specific physical addressing scheme + using a DPA. The device physical address(DPA) to use will be + presented in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/nibble_mask +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Read/Write Nibble mask of the memory to repair. + Nibble mask identifies one or more nibbles in error on the + memory bus that produced the error event. Nibble Mask bit 0 + shall be set if nibble 0 on the memory bus produced the + event, etc. For example, CXL PPR and sparing, a nibble mask + bit set to 1 indicates the request to perform repair + operation in the specific device. All nibble mask bits set + to 1 indicates the request to perform the operation in all + devices. Eg. for CXL memory repair, the specific value of + nibble mask to use will be provided in related error records. + For more details, See nibble mask field in CXL spec ver 3.1, + section 8.2.9.7.1.2 Table 8-103 soft PPR and section + 8.2.9.7.1.3 Table 8-104 hard PPR, section 8.2.9.7.1.4 + Table 8-105 memory sparing. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/min_hpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/max_hpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/min_dpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/max_dpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The supported range of memory address that is to be + repaired. The memory device may give the supported range of + attributes to use and it will depend on the memory device + and the portion of memory to repair. + The userspace may receive the specific value of attributes + to use for a repair operation from the memory device via + related error records and trace events, for eg. CXL DRAM + and CXL general media error records in CXL memory devices. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank_group +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/rank +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/row +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/column +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/channel +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/sub_channel +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The control attributes for the memory to be repaired. + The specific value of attributes to use depends on the + portion of memory to repair and will be reported to the host + in related error records and be available to userspace + in trace events, such as CXL DRAM and CXL general media + error records of CXL memory devices. 
+ + When readng back these attributes, it returns the current + value of memory requested to be repaired. + + bank_group - The bank group of the memory to repair. + + bank - The bank number of the memory to repair. + + rank - The rank of the memory to repair. Rank is defined as a + set of memory devices on a channel that together execute a + transaction. + + row - The row number of the memory to repair. + + column - The column number of the memory to repair. + + channel - The channel of the memory to repair. Channel is + defined as an interface that can be independently accessed + for a transaction. + + sub_channel - The subchannel of the memory to repair. + + The requirement to set these attributes varies based on the + repair function. The attributes in sysfs are not present + unless required for a repair function. + + For example, CXL spec ver 3.1, Section 8.2.9.7.1.2 Table 8-103 + soft PPR and Section 8.2.9.7.1.3 Table 8-104 hard PPR operations, + these attributes are not required to set. CXL spec ver 3.1, + Section 8.2.9.7.1.4 Table 8-105 memory sparing, these attributes + are required to set based on memory sparing granularity. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (WO) Issue the memory repair operation for the specified + memory repair attributes. The operation may fail if resources + are insufficient based on the requirements of the memory + device and repair function. + + - 1 - Issue the repair operation. + + - All other values are reserved. diff --git a/Documentation/ABI/testing/sysfs-edac-scrub b/Documentation/ABI/testing/sysfs-edac-scrub new file mode 100644 index 000000000000..c43be90deab4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-scrub @@ -0,0 +1,69 @@ +What: /sys/bus/edac/devices/<dev-name>/scrubX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/scrubX subdirectory + belongs to an instance of memory scrub control feature, + where <dev-name> directory corresponds to a device/memory + region registered with the EDAC device driver for the + scrub control feature. + + The sysfs scrub attr nodes are only present if the parent + driver has implemented the corresponding attr callback + function and provided the necessary operations to the EDAC + device driver during registration. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/addr +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The base address of the memory region to be scrubbed + for on-demand scrubbing. Setting address starts scrubbing. + The size must be set before that. + + The readback addr value is non-zero if the requested + on-demand scrubbing is in progress, zero otherwise. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/size +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The size of the memory region to be scrubbed + (on-demand scrubbing). + +What: /sys/bus/edac/devices/<dev-name>/scrubX/enable_background +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Start/Stop background (patrol) scrubbing if supported. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/min_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Supported minimum scrub cycle duration in seconds + by the memory scrubber. 
+ +What: /sys/bus/edac/devices/<dev-name>/scrubX/max_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Supported maximum scrub cycle duration in seconds + by the memory scrubber. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/current_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The current scrub cycle duration in seconds and must be + within the supported range by the memory scrubber. + + Scrub has an overhead when running and that may want to be + reduced by taking longer to do it. diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 3e1630c70d8a..e44bb614964b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -347,7 +347,7 @@ Description: Used to control configure extension list: - [c] means add/del cold file extension What: /sys/fs/f2fs/<disk>/unusable -Date April 2019 +Date: April 2019 Contact: "Daniel Rosenberg" <drosen@google.com> Description: If checkpoint=disable, it displays the number of blocks that are unusable. @@ -355,7 +355,7 @@ Description: If checkpoint=disable, it displays the number of blocks that would be unusable if checkpoint=disable were to be set. What: /sys/fs/f2fs/<disk>/encoding -Date July 2019 +Date: July 2019 Contact: "Daniel Rosenberg" <drosen@google.com> Description: Displays name and version of the encoding set for the filesystem. If no encoding is set, displays (none) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index a3942b1036e2..2192478e83cf 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -131,7 +131,7 @@ Description: CAUTION: Using it will cause your machine's real-time (CMOS) clock to be set to a random invalid time after a resume. -What; /sys/power/pm_trace_dev_match +What: /sys/power/pm_trace_dev_match Date: October 2010 Contact: James Hogan <jhogan@kernel.org> Description: diff --git a/Documentation/Makefile b/Documentation/Makefile index 52c6c5a3efa9..63094646df28 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,7 +12,7 @@ endif # Check for broken ABI files ifeq ($(CONFIG_WARN_ABI_ERRORS),y) -$(shell $(srctree)/scripts/get_abi.pl validate --dir $(srctree)/Documentation/ABI) +$(shell $(srctree)/scripts/get_abi.py --dir $(srctree)/Documentation/ABI validate) endif # You can set these variables from the command line. diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst index 6da7f66da2a8..12a7b059654f 100644 --- a/Documentation/RCU/rcubarrier.rst +++ b/Documentation/RCU/rcubarrier.rst @@ -329,10 +329,7 @@ Answer: was first added back in 2005. This is because on_each_cpu() disables preemption, which acted as an RCU read-side critical section, thus preventing CPU 0's grace period from completing - until on_each_cpu() had dealt with all of the CPUs. However, - with the advent of preemptible RCU, rcu_barrier() no longer - waited on nonpreemptible regions of code in preemptible kernels, - that being the job of the new rcu_barrier_sched() function. + until on_each_cpu() had dealt with all of the CPUs. 
However, with the RCU flavor consolidation around v4.20, this possibility was once again ruled out, because the consolidated diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 30080ff6f406..d1ccd6039a8c 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -96,6 +96,13 @@ warnings: the ``rcu_.*timer wakeup didn't happen for`` console-log message, which will include additional debugging information. +- A timer issue causes time to appear to jump forward, so that RCU + believes that the RCU CPU stall-warning timeout has been exceeded + when in fact much less time has passed. This could be due to + timer hardware bugs, timer driver bugs, or even corruption of + the "jiffies" global variable. These sorts of timer hardware + and driver bugs are not uncommon when testing new hardware. + - A low-level kernel issue that either fails to invoke one of the variants of rcu_eqs_enter(true), rcu_eqs_exit(true), ct_idle_enter(), ct_idle_exit(), ct_irq_enter(), or ct_irq_exit() on the one diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index b557cf1c820d..70b02f30013a 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -165,7 +165,7 @@ Configuring the kernel "make xconfig" Qt based configuration tool. - "make gconfig" GTK+ based configuration tool. + "make gconfig" GTK based configuration tool. "make oldconfig" Default all questions based on the contents of your existing ./.config file and asking about diff --git a/Documentation/admin-guide/abi-obsolete-files.rst b/Documentation/admin-guide/abi-obsolete-files.rst new file mode 100644 index 000000000000..3061a916b4b5 --- /dev/null +++ b/Documentation/admin-guide/abi-obsolete-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Obsolete ABI Files +================== + +.. kernel-abi:: obsolete + :no-symbols: diff --git a/Documentation/admin-guide/abi-obsolete.rst b/Documentation/admin-guide/abi-obsolete.rst index 594e697aa1b2..640f3903e847 100644 --- a/Documentation/admin-guide/abi-obsolete.rst +++ b/Documentation/admin-guide/abi-obsolete.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI obsolete symbols ==================== @@ -7,5 +9,5 @@ marked to be removed at some later point in time. The description of the interface will document the reason why it is obsolete and when it can be expected to be removed. -.. kernel-abi:: ABI/obsolete - :rst: +.. kernel-abi:: obsolete + :no-files: diff --git a/Documentation/admin-guide/abi-removed-files.rst b/Documentation/admin-guide/abi-removed-files.rst new file mode 100644 index 000000000000..f1bdfadd2ec4 --- /dev/null +++ b/Documentation/admin-guide/abi-removed-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Removed ABI Files +================= + +.. kernel-abi:: removed + :no-symbols: diff --git a/Documentation/admin-guide/abi-removed.rst b/Documentation/admin-guide/abi-removed.rst index f9e000c81828..88832d3eacd6 100644 --- a/Documentation/admin-guide/abi-removed.rst +++ b/Documentation/admin-guide/abi-removed.rst @@ -1,5 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI removed symbols =================== -.. kernel-abi:: ABI/removed - :rst: +.. 
kernel-abi:: removed + :no-files: diff --git a/Documentation/admin-guide/abi-stable-files.rst b/Documentation/admin-guide/abi-stable-files.rst new file mode 100644 index 000000000000..f867738fc178 --- /dev/null +++ b/Documentation/admin-guide/abi-stable-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Stable ABI Files +================ + +.. kernel-abi:: stable + :no-symbols: diff --git a/Documentation/admin-guide/abi-stable.rst b/Documentation/admin-guide/abi-stable.rst index fc3361d847b1..528c68401f4b 100644 --- a/Documentation/admin-guide/abi-stable.rst +++ b/Documentation/admin-guide/abi-stable.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI stable symbols ================== @@ -10,5 +12,5 @@ for at least 2 years. Most interfaces (like syscalls) are expected to never change and always be available. -.. kernel-abi:: ABI/stable - :rst: +.. kernel-abi:: stable + :no-files: diff --git a/Documentation/admin-guide/abi-testing-files.rst b/Documentation/admin-guide/abi-testing-files.rst new file mode 100644 index 000000000000..1da868e42fdb --- /dev/null +++ b/Documentation/admin-guide/abi-testing-files.rst @@ -0,0 +1,7 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Testing ABI Files +================= + +.. kernel-abi:: testing + :no-symbols: diff --git a/Documentation/admin-guide/abi-testing.rst b/Documentation/admin-guide/abi-testing.rst index 19767926b344..6153ebd38e2d 100644 --- a/Documentation/admin-guide/abi-testing.rst +++ b/Documentation/admin-guide/abi-testing.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ABI testing symbols =================== @@ -16,5 +18,5 @@ Programs that use these interfaces are strongly encouraged to add their name to the description of these interfaces, so that the kernel developers can easily notify them if any changes occur. -.. kernel-abi:: ABI/testing - :rst: +.. kernel-abi:: testing + :no-files: diff --git a/Documentation/admin-guide/abi.rst b/Documentation/admin-guide/abi.rst index bcab3ef2597c..c6039359e585 100644 --- a/Documentation/admin-guide/abi.rst +++ b/Documentation/admin-guide/abi.rst @@ -1,7 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + ===================== Linux ABI description ===================== +.. kernel-abi:: README + +ABI symbols +----------- + .. toctree:: :maxdepth: 2 @@ -9,3 +16,14 @@ Linux ABI description abi-testing abi-obsolete abi-removed + +ABI files +--------- + +.. toctree:: + :maxdepth: 2 + + abi-stable-files + abi-testing-files + abi-obsolete-files + abi-removed-files diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst index 582d3427de3f..a964aff373b1 100644 --- a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst +++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst @@ -125,3 +125,7 @@ to unfreeze all tasks in the container:: This is the basic mechanism which should do the right thing for user space task in a simple scenario. + +This freezer implementation is affected by shortcomings (see commit +76f969e8948d8 ("cgroup: cgroup v2 freezer")) and cgroup v2 freezer is +recommended. diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 286d16fc22eb..02b8206a3594 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -90,6 +90,7 @@ Brief summary of control files. used. 
memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) + Per memcg knob does not exist in cgroup v2. memory.move_charge_at_immigrate This knob is deprecated. memory.oom_control set/show oom controls. This knob is deprecated and shouldn't be diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index cb1b4e759b7e..f293a13b42ed 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1076,15 +1076,20 @@ cpufreq governor about the minimum desired frequency which should always be provided by a CPU, as well as the maximum desired frequency, which should not be exceeded by a CPU. -WARNING: cgroup2 doesn't yet support control of realtime processes. For -a kernel built with the CONFIG_RT_GROUP_SCHED option enabled for group -scheduling of realtime processes, the cpu controller can only be enabled -when all RT processes are in the root cgroup. This limitation does -not apply if CONFIG_RT_GROUP_SCHED is disabled. Be aware that system -management software may already have placed RT processes into nonroot -cgroups during the system boot process, and these processes may need -to be moved to the root cgroup before the cpu controller can be enabled -with a CONFIG_RT_GROUP_SCHED enabled kernel. +WARNING: cgroup2 cpu controller doesn't yet fully support the control of +realtime processes. For a kernel built with the CONFIG_RT_GROUP_SCHED option +enabled for group scheduling of realtime processes, the cpu controller can only +be enabled when all RT processes are in the root cgroup. Be aware that system +management software may already have placed RT processes into non-root cgroups +during the system boot process, and these processes may need to be moved to the +root cgroup before the cpu controller can be enabled with a +CONFIG_RT_GROUP_SCHED enabled kernel. + +With CONFIG_RT_GROUP_SCHED disabled, this limitation does not apply and some of +the interface files either affect realtime processes or account for them. See +the following section for details. Only the cpu controller is affected by +CONFIG_RT_GROUP_SCHED. Other controllers can be used for the resource control of +realtime processes irrespective of CONFIG_RT_GROUP_SCHED. CPU Interface Files diff --git a/Documentation/admin-guide/gpio/gpio-sim.rst b/Documentation/admin-guide/gpio/gpio-sim.rst index 1cc5567a4bbe..35d49ccd49e0 100644 --- a/Documentation/admin-guide/gpio/gpio-sim.rst +++ b/Documentation/admin-guide/gpio/gpio-sim.rst @@ -71,7 +71,7 @@ specific lines. The name of those subdirectories must take the form of: ``'line<offset>'`` (e.g. ``'line0'``, ``'line20'``, etc.) as the name will be used by the module to assign the config to the specific line at given offset. -Once the confiuration is complete, the ``'live'`` attribute must be set to 1 in +Once the configuration is complete, the ``'live'`` attribute must be set to 1 in order to instantiate the chip. It can be set back to 0 to destroy the simulated chip. The module will synchronously wait for the new simulated device to be successfully probed and if this doesn't happen, writing to ``'live'`` will diff --git a/Documentation/admin-guide/gpio/gpio-virtuser.rst b/Documentation/admin-guide/gpio/gpio-virtuser.rst index 2aca70db9f3b..7e7c0df51640 100644 --- a/Documentation/admin-guide/gpio/gpio-virtuser.rst +++ b/Documentation/admin-guide/gpio/gpio-virtuser.rst @@ -92,7 +92,7 @@ struct. 
The first two take string values as arguments: Activating GPIO consumers ------------------------- -Once the confiuration is complete, the ``'live'`` attribute must be set to 1 in +Once the configuration is complete, the ``'live'`` attribute must be set to 1 in order to instantiate the consumer. It can be set back to 0 to destroy the virtual device. The module will synchronously wait for the new simulated device to be successfully probed and if this doesn't happen, writing to ``'live'`` will diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst deleted file mode 100644 index 6ee70465c0ea..000000000000 --- a/Documentation/admin-guide/highuid.rst +++ /dev/null @@ -1,80 +0,0 @@ -=================================================== -Notes on the change from 16-bit UIDs to 32-bit UIDs -=================================================== - -:Author: Chris Wing <wingc@umich.edu> -:Last updated: January 11, 2000 - -- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t - when communicating between user and kernel space in an ioctl or data - structure. - -- kernel code should use uid_t and gid_t in kernel-private structures and - code. - -What's left to be done for 32-bit UIDs on all Linux architectures: - -- Disk quotas have an interesting limitation that is not related to the - maximum UID/GID. They are limited by the maximum file size on the - underlying filesystem, because quota records are written at offsets - corresponding to the UID in question. - Further investigation is needed to see if the quota system can cope - properly with huge UIDs. If it can deal with 64-bit file offsets on all - architectures, this should not be a problem. - -- Decide whether or not to keep backwards compatibility with the system - accounting file, or if we should break it as the comments suggest - (currently, the old 16-bit UID and GID are still written to disk, and - part of the former pad space is used to store separate 32-bit UID and - GID) - -- Need to validate that OS emulation calls the 16-bit UID - compatibility syscalls, if the OS being emulated used 16-bit UIDs, or - uses the 32-bit UID system calls properly otherwise. - - This affects at least: - - - iBCS on Intel - - - sparc32 emulation on sparc64 - (need to support whatever new 32-bit UID system calls are added to - sparc32) - -- Validate that all filesystems behave properly. - - At present, 32-bit UIDs _should_ work for: - - - ext2 - - ufs - - isofs - - nfs - - coda - - udf - - Ioctl() fixups have been made for: - - - ncpfs - - smbfs - - Filesystems with simple fixups to prevent 16-bit UID wraparound: - - - minix - - sysv - - qnx4 - - Other filesystems have not been checked yet. - -- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in - all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but - more are needed. (as well as new user<->kernel data structures) - -- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, - sh, and sparc32. Fixing this is probably not that important, but would - require adding a new ELF section. - -- The ioctl()s used to control the in-kernel NFS server only support - 16-bit UIDs on arm, i386, m68k, sh, and sparc32. 
- -- make sure that the UID mapping feature of AX25 networking works properly - (it should be safe because it's always used a 32-bit integer to - communicate between user and kernel) diff --git a/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst index 0585d02b9a6c..ad15417d39f9 100644 --- a/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst +++ b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst @@ -29,14 +29,6 @@ Below is the list of affected Intel processors [#f1]_: RAPTORLAKE_S 06_BFH =================== ============ -As an exception to this table, Intel Xeon E family parts ALDERLAKE(06_97H) and -RAPTORLAKE(06_B7H) codenamed Catlow are not affected. They are reported as -vulnerable in Linux because they share the same family/model with an affected -part. Unlike their affected counterparts, they do not enumerate RFDS_CLEAR or -CPUID.HYBRID. This information could be used to distinguish between the -affected and unaffected parts, but it is deemed not worth adding complexity as -the reporting is fixed automatically when these parts enumerate RFDS_NO. - Mitigation ========== Intel released a microcode update that enables software to clear sensitive diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst index 2ad1c05b8c88..66af95251a3d 100644 --- a/Documentation/admin-guide/hw-vuln/srso.rst +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -104,7 +104,20 @@ The possible values in this file are: (spec_rstack_overflow=ibpb-vmexit) + * 'Mitigation: Reduced Speculation': + This mitigation gets automatically enabled when the above one "IBPB on + VMEXIT" has been selected and the CPU supports the BpSpecReduce bit. + + It gets automatically enabled on machines which have the + SRSO_USER_KERNEL_NO=1 CPUID bit. In that case, the code logic is to switch + to the above =ibpb-vmexit mitigation because the user/kernel boundary is + not affected anymore and thus "safe RET" is not needed. + + After enabling the IBPB on VMEXIT mitigation option, the BpSpecReduce bit + is detected (functionality present on all such machines) and that + practically overrides IBPB on VMEXIT as it has a lot less performance + impact and takes care of the guest->host attack vector too. In order to exploit vulnerability, an attacker needs to: diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index c8af32a8f800..259d79fbeb94 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -187,7 +187,6 @@ A few hard-to-categorize and generally obsolete documents. .. toctree:: :maxdepth: 1 - highuid ldm unicode diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index 609a3201fd4e..9453196ade51 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -2,62 +2,39 @@ I/O statistics fields ===================== -Since 2.4.20 (and some versions before, with patches), and 2.5.45, -more extensive disk statistics have been introduced to help measure disk -activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do -the work for you, but in case you are interested in creating your own -tools, the fields are explained here. - -In 2.4 now, the information is found as additional fields in -``/proc/partitions``. 
In 2.6 and upper, the same information is found in two -places: one is in the file ``/proc/diskstats``, and the other is within -the sysfs file system, which must be mounted in order to obtain -the information. Throughout this document we'll assume that sysfs -is mounted on ``/sys``, although of course it may be mounted anywhere. -Both ``/proc/diskstats`` and sysfs use the same source for the information -and so should not differ. - -Here are examples of these different formats:: - - 2.4: - 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 - - 2.6+ sysfs: - 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 35486 38030 38030 38030 - - 2.6+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 hda1 35486 38030 38030 38030 - - 4.18+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 - -On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have -a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. - -The advantage of one over the other is that the sysfs choice works well -if you are watching a known, small set of disks. ``/proc/diskstats`` may -be a better choice if you are watching a large number of disks because -you'll avoid the overhead of 50, 100, or 500 or more opens/closes with -each snapshot of your disk statistics. - -In 2.4, the statistics fields are those after the device name. In -the above example, the first field of statistics would be 446216. -By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll -find just the 15 fields, beginning with 446216. If you look at -``/proc/diskstats``, the 15 fields will be preceded by the major and -minor device numbers, and device name. Each of these formats provides -15 fields of statistics, each meaning exactly the same things. -All fields except field 9 are cumulative since boot. Field 9 should -go to zero as I/Os complete; all others only increase (unless they -overflow and wrap). Wrapping might eventually occur on a very busy -or long-lived system; so applications should be prepared to deal with -it. Regarding wrapping, the types of the fields are either unsigned -int (32 bit) or unsigned long (32-bit or 64-bit, depending on your -machine) as noted per-field below. Unless your observations are very -spread in time, these fields should not wrap twice before you notice it. +The kernel exposes disk statistics via ``/proc/diskstats`` and +``/sys/block/<device>/stat``. These stats are usually accessed via tools +such as ``sar`` and ``iostat``. + +Here are examples using a disk with two partitions:: + + /proc/diskstats: + 259 0 nvme0n1 255999 814 12369153 47919 996852 81 36123024 425995 0 301795 580470 0 0 0 0 60602 106555 + 259 1 nvme0n1p1 492 813 17572 96 848 81 108288 210 0 76 307 0 0 0 0 0 0 + 259 2 nvme0n1p2 255401 1 12343477 47799 996004 0 36014736 425784 0 344336 473584 0 0 0 0 0 0 + + /sys/block/nvme0n1/stat: + 255999 814 12369153 47919 996858 81 36123056 426009 0 301809 580491 0 0 0 0 60605 106562 + + /sys/block/nvme0n1/nvme0n1p1/stat: + 492 813 17572 96 848 81 108288 210 0 76 307 0 0 0 0 0 0 + +Both files contain the same 17 statistics. ``/sys/block/<device>/stat`` +contains the fields for ``<device>``. In ``/proc/diskstats`` the fields +are prefixed with the major and minor device numbers and the device +name. 
In the example above, the first stat value for ``nvme0n1`` is +255999 in both files. + +The sysfs ``stat`` file is efficient for monitoring a small, known set +of disks. If you're tracking a large number of devices, +``/proc/diskstats`` is often the better choice since it avoids the +overhead of opening and closing multiple files for each snapshot. + +All fields are cumulative, monotonic counters, except for field 9, which +resets to zero as I/Os complete. The remaining fields reset at boot, on +device reattachment or reinitialization, or when the underlying counter +overflows. Applications reading these counters should detect and handle +resets when comparing stat snapshots. Each set of stats only applies to the indicated device; if you want system-wide stats you'll have to find all the devices and sum them all up. diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 5376890adbeb..1f7f14c6e184 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -180,10 +180,6 @@ Dump-capture kernel config options (Arch Dependent, i386 and x86_64) 1) On i386, enable high memory support under "Processor type and features":: - CONFIG_HIGHMEM64G=y - - or:: - CONFIG_HIGHMEM4G 2) With CONFIG_SMP=y, usually nr_cpus=1 need specified on the kernel diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec8..bc24adc4d228 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -416,10 +416,6 @@ Format: { quiet (default) | verbose | debug } Change the amount of debugging information output when initialising the APIC and IO-APIC components. - For X86-32, this can also be used to specify an APIC - driver name. - Format: apic=driver_name - Examples: apic=bigsmp apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } @@ -1785,7 +1781,9 @@ allocation boundaries as a proactive defense against bounds-checking flaws in the kernel's copy_to_user()/copy_from_user() interface. - on Perform hardened usercopy checks (default). + The default is determined by + CONFIG_HARDENED_USERCOPY_DEFAULT_ON. + on Perform hardened usercopy checks. off Disable hardened usercopy checks. hardlockup_all_cpu_backtrace= @@ -5758,6 +5756,11 @@ rcutorture.test_boost_duration= [KNL] Duration (s) of each individual boost test. + rcutorture.test_boost_holdoff= [KNL] + Holdoff time (s) from start of test to the start + of RCU priority-boost testing. Defaults to zero, + that is, no holdoff. + rcutorture.test_boost_interval= [KNL] Interval (s) between each boost test. @@ -6082,7 +6085,7 @@ is assumed to be I/O ports; otherwise it is memory. reserve_mem= [RAM] - Format: nn[KNG]:<align>:<label> + Format: nn[KMG]:<align>:<label> Reserve physical memory and label it with a name that other subsystems can use to access it. This is typically used for systems that do not wipe the RAM, and this command @@ -6582,6 +6585,8 @@ Selecting 'on' will also enable the mitigation against user space to user space task attacks. + Selecting specific mitigation does not force enable + user mitigations. Selecting 'off' will disable both the kernel and the user space protections. @@ -7672,13 +7677,6 @@ 16 - SIGBUS faults Example: user_debug=31 - userpte= - [X86,EARLY] Flags controlling user PTE allocations. - - nohigh = do not allocate PTE pages in - HIGHMEM regardless of setting - of CONFIG_HIGHPTE. 
- vdso= [X86,SH,SPARC] On X86_32, this is an alias for vdso32=. Otherwise: diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index a21369eba034..3950583f2b15 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -248,6 +248,20 @@ are the following: If that frequency cannot be determined, this attribute should not be present. +``cpuinfo_avg_freq`` + An average frequency (in KHz) of all CPUs belonging to a given policy, + derived from a hardware provided feedback and reported on a time frame + spanning at most few milliseconds. + + This is expected to be based on the frequency the hardware actually runs + at and, as such, might require specialised hardware support (such as AMU + extension on ARM). If one cannot be determined, this attribute should + not be present. + + Note, that failed attempt to retrieve current frequency for a given + CPU(s) will result in an appropriate error, i.e: EAGAIN for CPU that + remains idle (raised on ARM). + ``cpuinfo_max_freq`` Maximum possible operating frequency the CPUs belonging to this policy can run at (in kHz). @@ -293,7 +307,8 @@ are the following: Some architectures (e.g. ``x86``) may attempt to provide information more precisely reflecting the current CPU frequency through this attribute, but that still may not be the exact current CPU frequency as - seen by the hardware at the moment. + seen by the hardware at the moment. This behavior though, is only + available via c:macro:``CPUFREQ_ARCH_CUR_FREQ`` option. ``scaling_driver`` The scaling driver currently in use. diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst index 2ed79f41a411..d0502691dfa1 100644 --- a/Documentation/admin-guide/thunderbolt.rst +++ b/Documentation/admin-guide/thunderbolt.rst @@ -28,7 +28,7 @@ should be a userspace tool that handles all the low-level details, keeps a database of the authorized devices and prompts users for new connections. More details about the sysfs interface for Thunderbolt devices can be -found in ``Documentation/ABI/testing/sysfs-bus-thunderbolt``. +found in Documentation/ABI/testing/sysfs-bus-thunderbolt. Those users who just want to connect any device without any sort of manual work can add following line to diff --git a/Documentation/admin-guide/workload-tracing.rst b/Documentation/admin-guide/workload-tracing.rst index 6be38c1b9c5b..d6313890ee41 100644 --- a/Documentation/admin-guide/workload-tracing.rst +++ b/Documentation/admin-guide/workload-tracing.rst @@ -82,7 +82,7 @@ Install tools to build Linux kernel and tools in kernel repository. scripts/ver_linux is a good way to check if your system already has the necessary tools:: - sudo apt-get build-essentials flex bison yacc + sudo apt-get install build-essential flex bison yacc sudo apt install libelf-dev systemtap-sdt-dev libslang2-dev libperl-dev libdw-dev cscope is a good tool to browse kernel sources. Let's install it now:: diff --git a/Documentation/arch/arm64/amu.rst b/Documentation/arch/arm64/amu.rst index 01f2de2b0450..ac1b3f0e211d 100644 --- a/Documentation/arch/arm64/amu.rst +++ b/Documentation/arch/arm64/amu.rst @@ -80,7 +80,7 @@ bypass the setting of AMUSERENR_EL0 to trap accesses from EL0 (userspace) to EL1 (kernel). Therefore, firmware should still ensure accesses to AMU registers are not trapped in EL2/EL3. 
-The fixed counters of AMUv1 are accessible though the following system +The fixed counters of AMUv1 are accessible through the following system register definitions: - SYS_AMEVCNTR0_CORE_EL0 diff --git a/Documentation/arch/arm64/asymmetric-32bit.rst b/Documentation/arch/arm64/asymmetric-32bit.rst index 1ca2b359a907..57b8d7476f71 100644 --- a/Documentation/arch/arm64/asymmetric-32bit.rst +++ b/Documentation/arch/arm64/asymmetric-32bit.rst @@ -55,7 +55,7 @@ sysfs The subset of CPUs capable of running 32-bit tasks is described in ``/sys/devices/system/cpu/aarch32_el0`` and is documented further in -``Documentation/ABI/testing/sysfs-devices-system-cpu``. +Documentation/ABI/testing/sysfs-devices-system-cpu. **Note:** CPUs are advertised by this file as they are detected and so late-onlining of 32-bit-capable CPUs can result in the file contents diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index cad6fdc96b98..dee7b6de864f 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -288,6 +288,12 @@ Before jumping into the kernel, the following conditions must be met: - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1. + For CPUs with the Fine Grained Traps 2 (FEAT_FGT2) extension present: + + - If EL3 is present and the kernel is entered at EL2: + + - SCR_EL3.FGTEn2 (bit 59) must be initialised to 0b1. + For CPUs with support for HCRX_EL2 (FEAT_HCX) present: - If EL3 is present and the kernel is entered at EL2: @@ -382,6 +388,22 @@ Before jumping into the kernel, the following conditions must be met: - SMCR_EL2.EZT0 (bit 30) must be initialised to 0b1. + For CPUs with the Performance Monitors Extension (FEAT_PMUv3p9): + + - If EL3 is present: + + - MDCR_EL3.EnPM2 (bit 7) must be initialised to 0b1. + + - If the kernel is entered at EL1 and EL2 is present: + + - HDFGRTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1. + - HDFGRTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. + - HDFGRTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + + - HDFGWTR2_EL2.nPMICNTR_EL0 (bit 2) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMICFILTR_EL0 (bit 3) must be initialised to 0b1. + - HDFGWTR2_EL2.nPMUACR_EL1 (bit 4) must be initialised to 0b1. + For CPUs with Memory Copy and Memory Set instructions (FEAT_MOPS): - If the kernel is entered at EL1 and EL2 is present: diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index f074f6219f5c..f968c13b46a7 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -284,6 +284,8 @@ stable kernels. 
+----------------+-----------------+-----------------+-----------------------------+ | Rockchip | RK3588 | #3588001 | ROCKCHIP_ERRATUM_3588001 | +----------------+-----------------+-----------------+-----------------------------+ +| Rockchip | RK3568 | #3568002 | ROCKCHIP_ERRATUM_3568002 | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Fujitsu | A64FX | E#010001 | FUJITSU_ERRATUM_010001 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index 76f53d3450e7..77e6163288db 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -1038,16 +1038,6 @@ Offset/size: 0x000c/4 This field contains maximal allowed type for setup_data and setup_indirect structs. -The Image Checksum -================== - -From boot protocol version 2.08 onwards the CRC-32 is calculated over -the entire file using the characteristic polynomial 0x04C11DB7 and an -initial remainder of 0xffffffff. The checksum is appended to the -file; therefore the CRC of the file up to the limit specified in the -syssize field of the header is always 0. - - The Kernel Command Line ======================= diff --git a/Documentation/arch/x86/usb-legacy-support.rst b/Documentation/arch/x86/usb-legacy-support.rst index e01c08b7c981..b17bf122270a 100644 --- a/Documentation/arch/x86/usb-legacy-support.rst +++ b/Documentation/arch/x86/usb-legacy-support.rst @@ -20,11 +20,7 @@ It has several drawbacks, though: features (wheel, extra buttons, touchpad mode) of the real PS/2 mouse may not be available. -2) If CONFIG_HIGHMEM64G is enabled, the PS/2 mouse emulation can cause - system crashes, because the SMM BIOS is not expecting to be in PAE mode. - The Intel E7505 is a typical machine where this happens. - -3) If AMD64 64-bit mode is enabled, again system crashes often happen, +2) If AMD64 64-bit mode is enabled, again system crashes often happen, because the SMM BIOS isn't expecting the CPU to be in 64-bit mode. The BIOS manufacturers only test with Windows, and Windows doesn't do 64-bit yet. @@ -38,11 +34,6 @@ Problem 1) compiled-in, too. Problem 2) - can currently only be solved by either disabling HIGHMEM64G - in the kernel config or USB Legacy support in the BIOS. A BIOS update - could help, but so far no such update exists. - -Problem 3) is usually fixed by a BIOS update. Check the board manufacturers web site. If an update is not available, disable USB Legacy support in the BIOS. If this alone doesn't help, try also adding diff --git a/Documentation/conf.py b/Documentation/conf.py index 0c2205d536b3..3dad1f90b098 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -47,7 +47,7 @@ from load_config import loadConfig # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -needs_sphinx = '2.4.4' +needs_sphinx = '3.4.3' # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom diff --git a/Documentation/core-api/min_heap.rst b/Documentation/core-api/min_heap.rst index 683bc6d09f00..9f57766581df 100644 --- a/Documentation/core-api/min_heap.rst +++ b/Documentation/core-api/min_heap.rst @@ -47,8 +47,8 @@ Example: #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) struct _name { - int nr; /* Number of elements in the heap */ - int size; /* Maximum number of elements that can be held */ + size_t nr; /* Number of elements in the heap */ + size_t size; /* Maximum number of elements that can be held */ _type *data; /* Pointer to the heap data */ _type preallocated[_nr]; /* Static preallocated array */ } diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index ecccc0473da9..4bdc394e86af 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -661,7 +661,7 @@ Do *not* use it from C. Thanks ====== -If you add other %p extensions, please extend <lib/test_printf.c> with -one or more test cases, if at all feasible. +If you add other %p extensions, please extend <lib/tests/printf_kunit.c> +with one or more test cases, if at all feasible. Thank you for your cooperation and attention. diff --git a/Documentation/core-api/this_cpu_ops.rst b/Documentation/core-api/this_cpu_ops.rst index 91acbcf30e9b..533ac5dd5750 100644 --- a/Documentation/core-api/this_cpu_ops.rst +++ b/Documentation/core-api/this_cpu_ops.rst @@ -138,12 +138,22 @@ get_cpu/put_cpu sequence requires. No processor number is available. Instead, the offset of the local per cpu area is simply added to the per cpu offset. -Note that this operation is usually used in a code segment when -preemption has been disabled. The pointer is then used to -access local per cpu data in a critical section. When preemption -is re-enabled this pointer is usually no longer useful since it may -no longer point to per cpu data of the current processor. - +Note that this operation can only be used in code segments where +smp_processor_id() may be used, for example, where preemption has been +disabled. The pointer is then used to access local per cpu data in a +critical section. When preemption is re-enabled this pointer is usually +no longer useful since it may no longer point to per cpu data of the +current processor. + +The special cases where it makes sense to obtain a per-CPU pointer in +preemptible code are addressed by raw_cpu_ptr(), but such use cases need +to handle cases where two different CPUs are accessing the same per cpu +variable, which might well be that of a third CPU. These use cases are +typically performance optimizations. For example, SRCU implements a pair +of counters as a pair of per-CPU variables, and rcu_read_lock_nmisafe() +uses raw_cpu_ptr() to get a pointer to some CPU's counter, and uses +atomic_inc_long() to handle migration between the raw_cpu_ptr() and +the atomic_inc_long(). Per cpu variables and offsets ----------------------------- diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index d81c42d1063e..8575178aa87f 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -203,7 +203,7 @@ they happen concurrently in different threads, and at least one of them is a least one is a write. For a more thorough discussion and definition, see `"Plain Accesses and Data Races" in the LKMM`_. -.. 
_"Plain Accesses and Data Races" in the LKMM: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt#n1922 +.. _"Plain Accesses and Data Races" in the LKMM: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt?id=8f6629c004b193d23612641c3607e785819e97ab#n2164 Relationship with the Linux-Kernel Memory Consistency Model (LKMM) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index fdb1df86783a..18c2da67fae4 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -347,7 +347,7 @@ kselftest. We use kselftests for lib/ as an example. 1. Create the test module 2. Create the test script that will run (load/unload) the module - e.g. ``tools/testing/selftests/lib/printf.sh`` + e.g. ``tools/testing/selftests/lib/bitmap.sh`` 3. Add line to config file e.g. ``tools/testing/selftests/lib/config`` diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml index f49b43f45f3d..06e3621a8c06 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml @@ -26,6 +26,7 @@ properties: deprecated: true - const: allwinner,sun7i-a20-sc-nmi - const: allwinner,sun9i-a80-nmi + - const: allwinner,sun55i-a523-nmi - items: - enum: - allwinner,sun8i-v3s-nmi diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml index d7ef4f1323a7..3f99c8645767 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rzv2h-icu.yaml @@ -4,7 +4,7 @@ $id: http://devicetree.org/schemas/interrupt-controller/renesas,rzv2h-icu.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Renesas RZ/V2H(P) Interrupt Control Unit +title: Renesas RZ/{G3E,V2H(P)} Interrupt Control Unit maintainers: - Fabrizio Castro <fabrizio.castro.jz@renesas.com> @@ -20,7 +20,9 @@ description: properties: compatible: - const: renesas,r9a09g057-icu # RZ/V2H(P) + enum: + - renesas,r9a09g047-icu # RZ/G3E + - renesas,r9a09g057-icu # RZ/V2H(P) '#interrupt-cells': description: The first cell is the SPI number of the NMI or the diff --git a/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml index 190a6499c932..bef00521d5da 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/riscv,aplic.yaml @@ -91,6 +91,14 @@ properties: Firmware must configure interrupt delegation registers based on interrupt delegation list. 
+ riscv,hart-indexes: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 16384 + description: + A list of hart indexes that APLIC should use to address each hart + that is mentioned in the "interrupts-extended" + dependencies: riscv,delegation: [ "riscv,children" ] diff --git a/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml b/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml new file mode 100644 index 000000000000..e1ffd55fa7bf --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/sophgo,sg2042-msi.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/interrupt-controller/sophgo,sg2042-msi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sophgo SG2042 MSI Controller + +maintainers: + - Chen Wang <unicorn_wang@outlook.com> + +description: + This interrupt controller is in Sophgo SG2042 for transforming interrupts from + PCIe MSI to PLIC interrupts. + +allOf: + - $ref: /schemas/interrupt-controller/msi-controller.yaml# + +properties: + compatible: + const: sophgo,sg2042-msi + + reg: + items: + - description: clear register + - description: msi doorbell address + + reg-names: + items: + - const: clr + - const: doorbell + + msi-controller: true + + msi-ranges: + maxItems: 1 + + "#msi-cells": + const: 0 + +required: + - compatible + - reg + - reg-names + - msi-controller + - msi-ranges + - "#msi-cells" + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + msi-controller@30000000 { + compatible = "sophgo,sg2042-msi"; + reg = <0x30000000 0x4>, <0x30000008 0x4>; + reg-names = "clr", "doorbell"; + msi-controller; + #msi-cells = <0>; + msi-ranges = <&plic 64 IRQ_TYPE_LEVEL_HIGH 32>; + }; diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml b/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml index 2a98b26630cb..c155c9c6db39 100644 --- a/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml +++ b/Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml @@ -40,7 +40,7 @@ properties: microchip,rx-int-gpios: description: - GPIO phandle of GPIO connected to to INT1 pin of the MCP251XFD, which + GPIO phandle of GPIO connected to INT1 pin of the MCP251XFD, which signals a pending RX interrupt. maxItems: 1 diff --git a/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml b/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml index 7c5ac5d2e880..f6884f6e59e7 100644 --- a/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml +++ b/Documentation/devicetree/bindings/net/can/renesas,rcar-canfd.yaml @@ -170,7 +170,7 @@ allOf: const: renesas,r8a779h0-canfd then: patternProperties: - "^channel[5-7]$": false + "^channel[4-7]$": false else: if: not: diff --git a/Documentation/driver-api/firmware/firmware-usage-guidelines.rst b/Documentation/driver-api/firmware/firmware-usage-guidelines.rst index fdcfce42c6d2..336e912281c3 100644 --- a/Documentation/driver-api/firmware/firmware-usage-guidelines.rst +++ b/Documentation/driver-api/firmware/firmware-usage-guidelines.rst @@ -42,3 +42,8 @@ then of course these rules will not apply strictly.) deprecating old major versions, then this should only be done as a last option, and be stated clearly in all communications. 
+* Firmware files that affect the User API (UAPI) shall not introduce + changes that break existing userspace programs. Updates to such firmware + must ensure backward compatibility with existing userspace applications. + This includes maintaining consistent interfaces and behaviors that + userspace programs rely on. diff --git a/Documentation/driver-api/generic-counter.rst b/Documentation/driver-api/generic-counter.rst index 71ccc30e586b..e826f16ea43d 100644 --- a/Documentation/driver-api/generic-counter.rst +++ b/Documentation/driver-api/generic-counter.rst @@ -467,7 +467,7 @@ Counter sysfs Translates counter data to the standard Counter sysfs interface format and vice versa. -Please refer to the ``Documentation/ABI/testing/sysfs-bus-counter`` file +Please refer to the Documentation/ABI/testing/sysfs-bus-counter file for a detailed breakdown of the available Generic Counter interface sysfs attributes. @@ -483,7 +483,7 @@ Sysfs Interface Several sysfs attributes are generated by the Generic Counter interface, and reside under the ``/sys/bus/counter/devices/counterX`` directory, where ``X`` is to the respective counter device id. Please see -``Documentation/ABI/testing/sysfs-bus-counter`` for detailed information +Documentation/ABI/testing/sysfs-bus-counter for detailed information on each Generic Counter interface sysfs attribute. Through these sysfs attributes, programs and scripts may interact with diff --git a/Documentation/driver-api/iio/core.rst b/Documentation/driver-api/iio/core.rst index dfe438dc91a7..42b580fb2989 100644 --- a/Documentation/driver-api/iio/core.rst +++ b/Documentation/driver-api/iio/core.rst @@ -60,7 +60,7 @@ directory. Common attributes are: * :file:`sampling_frequency_available`, available discrete set of sampling frequency values for device. * Available standard attributes for IIO devices are described in the - :file:`Documentation/ABI/testing/sysfs-bus-iio` file in the Linux kernel + :file:Documentation/ABI/testing/sysfs-bus-iio file in the Linux kernel sources. IIO device channels diff --git a/Documentation/driver-api/infiniband.rst b/Documentation/driver-api/infiniband.rst index 30e142ccbee9..10d8be9e74fe 100644 --- a/Documentation/driver-api/infiniband.rst +++ b/Documentation/driver-api/infiniband.rst @@ -77,14 +77,14 @@ iSCSI Extensions for RDMA (iSER) :internal: .. kernel-doc:: drivers/infiniband/ulp/iser/iscsi_iser.c - :functions: iscsi_iser_pdu_alloc iser_initialize_task_headers \ - iscsi_iser_task_init iscsi_iser_mtask_xmit iscsi_iser_task_xmit \ - iscsi_iser_cleanup_task iscsi_iser_check_protection \ - iscsi_iser_conn_create iscsi_iser_conn_bind \ - iscsi_iser_conn_start iscsi_iser_conn_stop \ - iscsi_iser_session_destroy iscsi_iser_session_create \ - iscsi_iser_set_param iscsi_iser_ep_connect iscsi_iser_ep_poll \ - iscsi_iser_ep_disconnect + :functions: iscsi_iser_pdu_alloc iser_initialize_task_headers + iscsi_iser_task_init iscsi_iser_mtask_xmit iscsi_iser_task_xmit + iscsi_iser_cleanup_task iscsi_iser_check_protection + iscsi_iser_conn_create iscsi_iser_conn_bind + iscsi_iser_conn_start iscsi_iser_conn_stop + iscsi_iser_session_destroy iscsi_iser_session_create + iscsi_iser_set_param iscsi_iser_ep_connect iscsi_iser_ep_poll + iscsi_iser_ep_disconnect .. 
kernel-doc:: drivers/infiniband/ulp/iser/iser_initiator.c :internal: diff --git a/Documentation/driver-api/media/drivers/zoran.rst b/Documentation/driver-api/media/drivers/zoran.rst index b205e10c3154..3e05b7f0442a 100644 --- a/Documentation/driver-api/media/drivers/zoran.rst +++ b/Documentation/driver-api/media/drivers/zoran.rst @@ -222,7 +222,7 @@ The CCIR - I uses the PAL colorsystem, and is used in Great Britain, Hong Kong, Ireland, Nigeria, South Africa. The CCIR - N uses the PAL colorsystem and PAL frame size but the NTSC framerate, -and is used in Argentinia, Uruguay, an a few others +and is used in Argentina, Uruguay, an a few others We do not talk about how the audio is broadcast ! diff --git a/Documentation/driver-api/media/maintainer-entry-profile.rst b/Documentation/driver-api/media/maintainer-entry-profile.rst index ffc712a5f632..ad96a89ee916 100644 --- a/Documentation/driver-api/media/maintainer-entry-profile.rst +++ b/Documentation/driver-api/media/maintainer-entry-profile.rst @@ -116,7 +116,7 @@ CEC drivers ``cec-compliance`` .. [3] The ``v4l2-compliance`` also covers the media controller usage inside V4L2 drivers. -Other compilance tools are under development to check other parts of the +Other compliance tools are under development to check other parts of the subsystem. Those tests need to pass before the patches go upstream. diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index ca16b5acbf30..c205efa4d45b 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -535,12 +535,12 @@ internally with a static identifier:: char devname[50]; snprintf(devname, sizeof(devname), "namespace%d.%d", - ndctl_region_get_id(region), paramaters->id); + ndctl_region_get_id(region), parameters->id); ndctl_namespace_set_alt_name(ndns, devname); /* 'uuid' must be set prior to setting size! */ - ndctl_namespace_set_uuid(ndns, paramaters->uuid); - ndctl_namespace_set_size(ndns, paramaters->size); + ndctl_namespace_set_uuid(ndns, parameters->uuid); + ndctl_namespace_set_size(ndns, parameters->size); /* unlike pmem namespaces, blk namespaces have a sector size */ if (parameters->lbasize) ndctl_namespace_set_sector_size(ndns, parameters->lbasize); diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index d448cb57df86..8d86d5da4023 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -358,7 +358,7 @@ the phases are: ``prepare``, ``suspend``, ``suspend_late``, ``suspend_noirq``. is probed against the device in question by passing them to the :c:func:`dev_pm_set_driver_flags` helper function.] If the first of these flags is set, the PM core will not apply the direct-complete - procedure described above to the given device and, consequenty, to any + procedure described above to the given device and, consequently, to any of its ancestors. The second flag, when set, informs the middle layer code (bus types, device types, PM domains, classes) that it should take the return value of the ``->prepare`` callback provided by the driver diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst new file mode 100644 index 000000000000..3f283de297c7 --- /dev/null +++ b/Documentation/edac/features.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +================= +EDAC/RAS features +================= + +Copyright (c) 2024-2025 HiSilicon Limited. 
+ +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) + +- Written for: 6.15 + +Introduction +------------ + +EDAC/RAS components plugging and high-level design: + +1. Scrub control + +2. Error Check Scrub (ECS) control + +3. ACPI RAS2 features + +4. Post Package Repair (PPR) control + +5. Memory Sparing Repair control + +High level design is illustrated in the following diagram:: + + +-----------------------------------------------+ + | Userspace - Rasdaemon | + | +-------------+ | + | | RAS CXL mem | +---------------+ | + | |error handler|---->| | | + | +-------------+ | RAS dynamic | | + | +-------------+ | scrub, memory | | + | | RAS memory |---->| repair control| | + | |error handler| +----|----------+ | + | +-------------+ | | + +--------------------------|--------------------+ + | + | + +-------------------------------|------------------------------+ + | Kernel EDAC extension for | controlling RAS Features | + |+------------------------------|----------------------------+ | + || EDAC Core Sysfs EDAC| Bus | | + || +--------------------------|---------------------------+| | + || |/sys/bus/edac/devices/<dev>/scrubX/ | | EDAC device || | + || |/sys/bus/edac/devices/<dev>/ecsX/ |<->| EDAC MC || | + || |/sys/bus/edac/devices/<dev>/repairX | | EDAC sysfs || | + || +---------------------------|--------------------------+| | + || EDAC|Bus | | + || | | | + || +----------+ Get feature | Get feature | | + || | | desc +---------|------+ desc +----------+ | | + || |EDAC scrub|<-----| EDAC device | | | | | + || +----------+ | driver- RAS |----->| EDAC mem | | | + || +----------+ | feature control| | repair | | | + || | |<-----| | +----------+ | | + || |EDAC ECS | +---------|------+ | | + || +----------+ Register RAS|features | | + || ______________________|_____________ | | + |+---------|---------------|------------------|--------------+ | + | +-------|----+ +-------|-------+ +----|----------+ | + | | | | CXL mem driver| | Client driver | | + | | ACPI RAS2 | | scrub, ECS, | | memory repair | | + | | driver | | sparing, PPR | | features | | + | +-----|------+ +-------|-------+ +------|--------+ | + | | | | | + +--------|-----------------|--------------------|--------------+ + | | | + +--------|-----------------|--------------------|--------------+ + | +---|-----------------|--------------------|-------+ | + | | | | + | | Platform HW and Firmware | | + | +--------------------------------------------------+ | + +--------------------------------------------------------------+ + + +1. EDAC Features components - Create feature-specific descriptors. For + example: scrub, ECS, memory repair in the above diagram. + +2. EDAC device driver for controlling RAS Features - Get feature's attribute + descriptors from EDAC RAS feature component and registers device's RAS + features with EDAC bus and expose the features control attributes via + sysfs. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/ + +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for + dynamic scrub/repair control to issue scrubbing/repair when excess number + of corrected memory errors are reported in a short span of time. + +RAS features +------------ +1. Memory Scrub + +Memory scrub features are documented in `Documentation/edac/scrub.rst`. + +2. Memory Repair + +Memory repair features are documented in `Documentation/edac/memory_repair.rst`. 
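+
+As a minimal illustration of how the hierarchy above is consumed (this sketch is
+not part of the patch), a userspace helper could read one control attribute of a
+registered feature instance. Only the
+``/sys/bus/edac/devices/<dev-name>/<feature>X/`` layout is taken from the text
+above; the device, instance and attribute names are placeholders, and the real
+ones are defined by the ABI files referenced in these documents::
+
+    /* Hypothetical helper: print one EDAC RAS feature attribute, e.g.
+     *   ./edac-attr cxl_mem0 scrub0 <attribute-name>
+     * All concrete names are assumptions, not part of the documented ABI. */
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    int main(int argc, char **argv)
+    {
+            char path[256], value[128];
+            FILE *f;
+
+            if (argc != 4) {
+                    fprintf(stderr, "usage: %s <dev-name> <featureX> <attribute>\n",
+                            argv[0]);
+                    return EXIT_FAILURE;
+            }
+
+            snprintf(path, sizeof(path), "/sys/bus/edac/devices/%s/%s/%s",
+                     argv[1], argv[2], argv[3]);
+
+            f = fopen(path, "r");
+            if (!f) {
+                    perror(path);
+                    return EXIT_FAILURE;
+            }
+            if (fgets(value, sizeof(value), f))
+                    printf("%s: %s", path, value);
+            fclose(f);
+            return EXIT_SUCCESS;
+    }
+
+Writing an attribute (for example to adjust a scrub setting or request an
+on-demand operation) follows the same pattern with ``fopen(path, "w")``, subject
+to the permissions documented in the ABI files.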
diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst new file mode 100644 index 000000000000..420c6601dbae --- /dev/null +++ b/Documentation/edac/index.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +============== +EDAC Subsystem +============== + +.. toctree:: + :maxdepth: 1 + + features + memory_repair + scrub diff --git a/Documentation/edac/memory_repair.rst b/Documentation/edac/memory_repair.rst new file mode 100644 index 000000000000..52162a422864 --- /dev/null +++ b/Documentation/edac/memory_repair.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +========================== +EDAC Memory Repair Control +========================== + +Copyright (c) 2024-2025 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) +:Original Reviewers: + +- Written for: 6.15 + +Introduction +------------ + +Some memory devices support repair operations to address issues in their +memory media. Post Package Repair (PPR) and memory sparing are examples of +such features. + +Post Package Repair (PPR) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Post Package Repair is a maintenance operation which requests the memory +device to perform repair operation on its media. It is a memory self-healing +feature that fixes a failing memory location by replacing it with a spare row +in a DRAM device. + +For example, a CXL memory device with DRAM components that support PPR +features implements maintenance operations. DRAM components support those +types of PPR functions: + + - hard PPR, for a permanent row repair, and + - soft PPR, for a temporary row repair. + +Soft PPR is much faster than hard PPR, but the repair is lost after a power +cycle. + +The data may not be retained and memory requests may not be correctly +processed during a repair operation. In such case, the repair operation should +not be executed at runtime. + +For example, for CXL memory devices, see CXL spec rev 3.1 [1]_ sections +8.2.9.7.1.1 PPR Maintenance Operations, 8.2.9.7.1.2 sPPR Maintenance Operation +and 8.2.9.7.1.3 hPPR Maintenance Operation for more details. + +Memory Sparing +~~~~~~~~~~~~~~ + +Memory sparing is a repair function that replaces a portion of memory with +a portion of functional memory at a particular granularity. Memory +sparing has cacheline/row/bank/rank sparing granularities. For example, in +rank memory-sparing mode, one memory rank serves as a spare for other ranks on +the same channel in case they fail. + +The spare rank is held in reserve and not used as active memory until +a failure is indicated, with reserved capacity subtracted from the total +available memory in the system. + +After an error threshold is surpassed in a system protected by memory sparing, +the content of a failing rank of DIMMs is copied to the spare rank. The +failing rank is then taken offline and the spare rank placed online for use as +active memory in place of the failed rank. + +For example, CXL memory devices can support various subclasses for sparing +operation vary in terms of the scope of the sparing being performed. + +Cacheline sparing subclass refers to a sparing action that can replace a full +cacheline. Row sparing is provided as an alternative to PPR sparing functions +and its scope is that of a single DDR row. Bank sparing allows an entire bank +to be replaced. 
Rank sparing is defined as an operation in which an entire DDR +rank is replaced. + +See CXL spec 3.1 [1]_ section 8.2.9.7.1.4 Memory Sparing Maintenance +Operations for more details. + +.. [1] https://computeexpresslink.org/cxl-specification/ + +Use cases of generic memory repair features control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. The soft PPR, hard PPR and memory-sparing features share similar control + attributes. Therefore, there is a need for a standardized, generic sysfs + repair control that is exposed to userspace and used by administrators, + scripts and tools. + +2. When a CXL device detects an error in a memory component, it informs the + host of the need for a repair maintenance operation by using an event + record where the "maintenance needed" flag is set. The event record + specifies the device physical address (DPA) and attributes of the memory + that requires repair. The kernel reports the corresponding CXL general + media or DRAM trace event to userspace, and userspace tools (e.g. + rasdaemon) initiate a repair maintenance operation in response to the + device request using the sysfs repair control. + +3. Userspace tools, such as rasdaemon, request a repair operation on a memory + region when maintenance need flag set or an uncorrected memory error or + excess of corrected memory errors above a threshold value is reported or an + exceed corrected errors threshold flag set for that memory. + +4. Multiple PPR/sparing instances may be present per memory device. + +5. Drivers should enforce that live repair is safe. In systems where memory + mapping functions can change between boots, one approach to this is to log + memory errors seen on this boot against which to check live memory repair + requests. + +The File System +--------------- + +The control attributes of a registered memory repair instance could be +accessed in the /sys/bus/edac/devices/<dev-name>/mem_repairX/ + +sysfs +----- + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-edac-memory-repair`. diff --git a/Documentation/edac/scrub.rst b/Documentation/edac/scrub.rst new file mode 100644 index 000000000000..daab929cdba1 --- /dev/null +++ b/Documentation/edac/scrub.rst @@ -0,0 +1,266 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +============= +Scrub Control +============= + +Copyright (c) 2024-2025 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) + +- Written for: 6.15 + +Introduction +------------ + +Increasing DRAM size and cost have made memory subsystem reliability an +important concern. These modules are used where potentially corrupted data +could cause expensive or fatal issues. Memory errors are among the top +hardware failures that cause server and workload crashes. + +Memory scrubbing is a feature where an ECC (Error-Correcting Code) engine +reads data from each memory media location, corrects if necessary and writes +the corrected data back to the same memory media location. + +DIMMs can be scrubbed at a configurable rate to detect uncorrected memory +errors and attempt recovery from detected errors, providing the following +benefits: + +1. Proactively scrubbing DIMMs reduces the chance of a correctable error + becoming uncorrectable. + +2. 
When detected, uncorrected errors caught in unallocated memory pages are + isolated and prevented from being allocated to an application or the OS. + +3. This reduces the likelihood of software or hardware products encountering + memory errors. + +4. The additional data on failures in memory may be used to build up + statistics that are later used to decide whether to use memory repair + technologies such as Post Package Repair or Sparing. + +There are 2 types of memory scrubbing: + +1. Background (patrol) scrubbing while the DRAM is otherwise idle. + +2. On-demand scrubbing for a specific address range or region of memory. + +Several types of interfaces to hardware memory scrubbers have been +identified, such as CXL memory device patrol scrub, CXL DDR5 ECS, ACPI +RAS2 memory scrubbing, and ACPI NVDIMM ARS (Address Range Scrub). + +The control mechanisms vary across different memory scrubbers. To enable +standardized userspace tooling, there is a need to present these controls +through a standardized ABI. + +A generic memory EDAC scrub control allows users to manage underlying +scrubbers in the system through a standardized sysfs control interface. It +abstracts the management of various scrubbing functionalities into a unified +set of functions. + +Use cases of common scrub control feature +----------------------------------------- + +1. Several types of interfaces for hardware memory scrubbers have been + identified, including the CXL memory device patrol scrub, CXL DDR5 ECS, + ACPI RAS2 memory scrubbing features, ACPI NVDIMM ARS (Address Range Scrub), + and software-based memory scrubbers. + + Of the identified interfaces to hardware memory scrubbers some support + control over patrol (background) scrubbing (e.g., ACPI RAS2, CXL) and/or + on-demand scrubbing (e.g., ACPI RAS2, ACPI ARS). However, the scrub control + interfaces vary between memory scrubbers, highlighting the need for + a standardized, generic sysfs scrub control interface that is accessible to + userspace for administration and use by scripts/tools. + +2. User-space scrub controls allow users to disable scrubbing if necessary, + for example, to disable background patrol scrubbing or adjust the scrub + rate for performance-aware operations where background activities need to + be minimized or disabled. + +3. User-space tools enable on-demand scrubbing for specific address ranges, + provided that the scrubber supports this functionality. + +4. User-space tools can also control memory DIMM scrubbing at a configurable + scrub rate via sysfs scrub controls. This approach offers several benefits: + + 4.1. Detects uncorrectable memory errors early, before user access to affected + memory, helping facilitate recovery. + + 4.2. Reduces the likelihood of correctable errors developing into uncorrectable + errors. + +5. Policy control for hotplugged memory is necessary because there may not + be a system-wide BIOS or similar control to manage scrub settings for a CXL + device added after boot. Determining these settings is a policy decision, + balancing reliability against performance, so userspace should control it. + Therefore, a unified interface is recommended for handling this function in + a way that aligns with other similar interfaces, rather than creating a + separate one. + +Scrubbing features +------------------ + +CXL Memory Scrubbing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CXL spec r3.1 [1]_ section 8.2.9.9.11.1 describes the memory device patrol +scrub control feature. 
The device patrol scrub proactively locates and makes +corrections to errors in regular cycle. The patrol scrub control allows the +userspace request to change CXL patrol scrubber's configurations. + +The patrol scrub control allows the requester to specify the number of +hours in which the patrol scrub cycles must be completed, provided that +the requested scrub rate must be within the supported range of the +scrub rate that the device is capable of. In the CXL driver, the +number of seconds per scrub cycles, which user requests via sysfs, is +rescaled to hours per scrub cycles. + +In addition, they allow the host to disable the feature in case it interferes +with performance-aware operations which require the background operations to +be turned off. + +Error Check Scrub (ECS) +~~~~~~~~~~~~~~~~~~~~~~~ + +CXL spec r3.1 [1]_ section 8.2.9.9.11.2 describes Error Check Scrub (ECS) +- a feature defined in the JEDEC DDR5 SDRAM Specification (JESD79-5) and +allowing DRAM to internally read, correct single-bit errors, and write back +corrected data bits to the DRAM array while providing transparency to error +counts. + +The DDR5 device contains number of memory media Field Replaceable Units (FRU) +per device. The DDR5 ECS feature and thus the ECS control driver supports +configuring the ECS parameters per FRU. + +ACPI RAS2 Hardware-based Memory Scrubbing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ACPI spec 6.5 [2]_ section 5.2.21 ACPI RAS2 describes an ACPI RAS2 table +which provides interfaces for platform RAS features and supports independent +RAS controls and capabilities for a given RAS feature for multiple instances +of the same component in a given system. + +Memory RAS features apply to RAS capabilities, controls and operations that +are specific to memory. RAS2 PCC sub-spaces for memory-specific RAS features +have a Feature Type of 0x00 (Memory). + +The platform can use the hardware-based memory scrubbing feature to expose +controls and capabilities associated with hardware-based memory scrub +engines. The RAS2 memory scrubbing feature supports as per spec, + +1. Independent memory scrubbing controls for each NUMA domain, identified + using its proximity domain. + +2. Provision for background (patrol) scrubbing of the entire memory system, + as well as on-demand scrubbing for a specific region of memory. + +ACPI Address Range Scrubbing (ARS) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ACPI spec 6.5 [2]_ section 9.19.7.2 describes Address Range Scrubbing (ARS). +ARS allows the platform to communicate memory errors to system software. +This capability allows system software to prevent accesses to addresses with +uncorrectable errors in memory. ARS functions manage all NVDIMMs present in +the system. Only one scrub can be in progress system wide at any given time. + +The following functions are supported as per the specification: + +1. Query ARS Capabilities for a given address range, indicates platform + supports the ACPI NVDIMM Root Device Unconsumed Error Notification. + +2. Start ARS triggers an Address Range Scrub for the given memory range. + Address scrubbing can be done for volatile or persistent memory, or both. + +3. Query ARS Status command allows software to get the status of ARS, + including the progress of ARS and ARS error record. + +4. Clear Uncorrectable Error. + +5. Translate SPA + +6. ARS Error Inject etc. + +The kernel supports an existing control for ARS and ARS is currently not +supported in EDAC. + +.. [1] https://computeexpresslink.org/cxl-specification/ + +.. 
[2] https://uefi.org/specs/ACPI/6.5/ + +Comparison of various scrubbing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +--------------+-----------+-----------+-----------+-----------+ + | | ACPI | CXL patrol| CXL ECS | ARS | + | Name | RAS2 | scrub | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | On-demand | Supported | No | No | Supported | + | Scrubbing | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Background | Supported | Supported | Supported | No | + | scrubbing | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Mode of | Scrub ctrl| per device| per memory| Unknown | + | scrubbing | per NUMA | | media | | + | | domain. | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Query scrub | Supported | Supported | Supported | Supported | + | capabilities | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Setting | Supported | No | No | Supported | + | address range| | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Setting | Supported | Supported | No | No | + | scrub rate | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Unit for | Not | in hours | No | No | + | scrub rate | Defined | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | Supported | | | | + | Scrub | on-demand | No | No | Supported | + | status/ | scrubbing | | | | + | Completion | only | | | | + +--------------+-----------+-----------+-----------+-----------+ + | UC error | |CXL general|CXL general| ACPI UCE | + | reporting | Exception |media/DRAM |media/DRAM | notify and| + | | |event/media|event/media| query | + | | |scan? |scan? | ARS status| + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Support for | Supported | Supported | Supported | No | + | EDAC control | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + +The File System +--------------- + +The control attributes of a registered scrubber instance could be +accessed in: + +/sys/bus/edac/devices/<dev-name>/scrubX/ + +sysfs +----- + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-edac-scrub` + +`Documentation/ABI/testing/sysfs-edac-ecs` diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 02febc883588..d937b7a03575 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -20,7 +20,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | ok | + | riscv: | TODO | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/features/list-arch.sh b/Documentation/features/list-arch.sh index e73aa35848f0..ac8ff7f6f859 100755 --- a/Documentation/features/list-arch.sh +++ b/Documentation/features/list-arch.sh @@ -6,6 +6,6 @@ # (If no arguments are given then it will print the host architecture's status.) 
# -ARCH=${1:-$(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/')} +ARCH=${1:-$(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/' | sed 's/s390x/s390/')} $(dirname $0)/../../scripts/get_feat.pl list --arch $ARCH diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 2bbf68b56b0d..3078f3c9256a 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -90,7 +90,7 @@ Just start the 9pfs capable network server like diod/nfs-ganesha e.g.:: $ diod -f -n -d 0 -S -l 0.0.0.0:9999 -e $PWD -Optionaly scan your bus if there are more then one usbg gadgets to find their path:: +Optionally scan your bus if there are more then one usbg gadgets to find their path:: $ python $kernel_dir/tools/usb/p9_fwd.py list diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst index 026b12ae0d6a..4b79ca58faf2 100644 --- a/Documentation/filesystems/bcachefs/SubmittingPatches.rst +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -30,7 +30,7 @@ CI: === Instead of running your tests locally, when running the full test suite it's -prefereable to let a server farm do it in parallel, and then have the results +preferable to let a server farm do it in parallel, and then have the results in a nice test dashboard (which can tell you which failures are new, and presents results in a git log view, avoiding the need for most bisecting). @@ -68,7 +68,7 @@ Other things to think about: land - use them. Use them judiciously, and not as a replacement for proper error handling, but use them. -- Does it need to be performance tested? Should we add new peformance counters? +- Does it need to be performance tested? Should we add new performance counters? bcachefs has a set of persistent runtime counters which can be viewed with the 'bcachefs fs top' command; this should give users a basic idea of what diff --git a/Documentation/filesystems/coda.rst b/Documentation/filesystems/coda.rst index bdde7e4e010b..0db3c83a50e5 100644 --- a/Documentation/filesystems/coda.rst +++ b/Documentation/filesystems/coda.rst @@ -141,7 +141,7 @@ kernel support. a process P which accessing a Coda file. It makes a system call which traps to the OS kernel. Examples of such calls trapping to the kernel are ``read``, ``write``, ``open``, ``close``, ``create``, ``mkdir``, - ``rmdir``, ``chmod`` in a Unix ontext. Similar calls exist in the Win32 + ``rmdir``, ``chmod`` in a Unix context. Similar calls exist in the Win32 environment, and are named ``CreateFile``. Generally the operating system handles the request in a virtual diff --git a/Documentation/filesystems/debugfs.rst b/Documentation/filesystems/debugfs.rst index f7f977ffbf8d..610f718ef8b5 100644 --- a/Documentation/filesystems/debugfs.rst +++ b/Documentation/filesystems/debugfs.rst @@ -220,7 +220,7 @@ There are a couple of other directory-oriented helper functions:: A call to debugfs_change_name() will give a new name to an existing debugfs file, always in the same directory. The new_name must not exist prior -to the call; the return value is 0 on success and -E... on failuer. +to the call; the return value is 0 on success and -E... on failure. Symbolic links can be created with debugfs_create_symlink(). 
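As a rough, self-contained sketch (not part of this patch) of how these helpers fit together, a module might create a directory, rename it with debugfs_change_name(), and point a symlink at the result; the directory and attribute names used here are purely illustrative::

    /* Illustrative only: all names below are made up. */
    #include <linux/module.h>
    #include <linux/debugfs.h>

    static struct dentry *dir;
    static struct dentry *link;
    static u32 counter;

    static int __init rename_example_init(void)
    {
            int ret;

            dir = debugfs_create_dir("rename_example", NULL);
            if (IS_ERR(dir))
                    return PTR_ERR(dir);

            debugfs_create_u32("counter", 0644, dir, &counter);

            /* Give the directory a new name; 0 on success, -E... on failure. */
            ret = debugfs_change_name(dir, "renamed_example");
            if (ret)
                    pr_warn("debugfs_change_name() failed: %d\n", ret);

            /* Symbolic link at the debugfs root pointing at the directory. */
            link = debugfs_create_symlink("example_link", NULL,
                                          ret ? "rename_example" : "renamed_example");
            return 0;
    }

    static void __exit rename_example_exit(void)
    {
            debugfs_remove(link);
            debugfs_remove(dir);
    }

    module_init(rename_example_init);
    module_exit(rename_example_exit);
    MODULE_LICENSE("GPL");

The return values of debugfs_create_u32() and debugfs_create_symlink() are deliberately not checked here, in line with debugfs's general convention that callers need not care about such failures.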
There is one important thing that all debugfs users must take into account: diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 2636f2a41bd3..a9cf8e950b15 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -118,7 +118,6 @@ Documentation for filesystem implementations. spufs/index squashfs sysfs - sysv-fs tmpfs ubifs ubifs-authentication diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..e29651a42eec 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: @@ -352,6 +356,11 @@ operations: ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or ``RWF_NOWAIT``. + * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a + buffered file I/O and would like the kernel to drop the pagecache + after the I/O completes, if it isn't already being used by another + thread. + If it is necessary to read existing file contents from a `different <https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_ device or address range on a device, the filesystem should return that diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..3b628e370d88 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. + Internal per-Folio State ------------------------ @@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows: struct iomap_writeback_ops { int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); void (*discard_folio)(struct folio *folio, loff_t pos); }; @@ -306,13 +308,12 @@ The fields are as follows: purpose. This function must be supplied by the filesystem. - - ``prepare_ioend``: Enables filesystems to transform the writeback - ioend or perform any other preparatory work before the writeback I/O - is submitted. + - ``submit_ioend``: Allows the file systems to hook into writeback bio + submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update - transactions from process context. + transactions from process context before submitting the bio. This function is optional. - ``discard_folio``: iomap calls this function after ``->map_blocks`` @@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the storage device. Filesystems that need to update internal bookkeeping (e.g. 
unwritten -extent conversions) should provide a ``->prepare_ioend`` function to +extent conversions) should provide a ``->submit_ioend`` function to set ``struct iomap_end::bio::bi_end_io`` to its own function. This function should call ``iomap_finish_ioends`` after finishing its own work (e.g. unwritten extent conversion). @@ -515,18 +516,33 @@ IOMAP_WRITE`` with any combination of the following enhancements: * ``IOMAP_ATOMIC``: This write is being issued with torn-write protection. - Only a single bio can be created for the write, and the write must - not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be - set. + Torn-write protection may be provided based on HW-offload or by a + software mechanism provided by the filesystem. + + For HW-offload based support, only a single bio can be created for the + write, and the write must not be split into multiple I/O requests, i.e. + flag REQ_ATOMIC must be set. The file range to write must be aligned to satisfy the requirements of both the filesystem and the underlying block device's atomic commit capabilities. If filesystem metadata updates are required (e.g. unwritten extent - conversion or copy on write), all updates for the entire file range + conversion or copy-on-write), all updates for the entire file range must be committed atomically as well. - Only one space mapping is allowed per untorn write. - Untorn writes must be aligned to, and must not be longer than, a - single file block. + Untorn-writes may be longer than a single file block. In all cases, + the mapping start disk block must have at least the same alignment as + the write offset. + The filesystems must set IOMAP_F_ATOMIC_BIO to inform iomap core of an + untorn-write based on HW-offload. + + For untorn-writes based on a software mechanism provided by the + filesystem, all the disk block alignment and single bio restrictions + which apply for HW-offload based untorn-writes do not apply. + The mechanism would typically be used as a fallback for when + HW-offload based untorn-writes may not be issued, e.g. the range of the + write covers multiple extents, meaning that it is not possible to issue + a single bio. + All filesystem metadata updates for the entire file range must be + committed atomically as well. Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d20a32b77b60..0ec0bb6eb0fb 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -66,7 +66,7 @@ prototypes:: int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); - int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); + struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 73f0bfd7e903..3886c14f89f4 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -515,7 +515,7 @@ The methods defined in the table are: the cache to expand a request in either direction. 
This allows the cache to size the request appropriately for the cache granularity. - The function is passed poiners to the start and length in its parameters, + The function is passed pointers to the start and length in its parameters, plus the size of the file for reference, and adjusts the start and length appropriately. It should return one of: diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 6245b67ae9e0..2db379b4b31e 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -292,13 +292,27 @@ rename or unlink will of course be noticed and handled). Permission model ---------------- +An overlay filesystem stashes credentials that will be used when +accessing lower or upper filesystems. + +In the old mount api the credentials of the task calling mount(2) are +stashed. In the new mount api the credentials of the task creating the +superblock through FSCONFIG_CMD_CREATE command of fsconfig(2) are +stashed. + +Starting with kernel v6.15 it is possible to use the "override_creds" +mount option which will cause the credentials of the calling task to be +recorded. Note that "override_creds" is only meaningful when used with +the new mount api as the old mount api combines setting options and +superblock creation in a single mount(2) syscall. + Permission checking in the overlay filesystem follows these principles: 1) permission check SHOULD return the same result before and after copy up 2) task creating the overlay mount MUST NOT gain additional privileges - 3) non-mounting task MAY gain additional privileges through the overlay, + 3) task[*] MAY gain additional privileges through the overlay, compared to direct access on underlying lower or upper filesystems This is achieved by performing two permission checks on each access: @@ -306,7 +320,7 @@ This is achieved by performing two permission checks on each access: a) check if current task is allowed access based on local DAC (owner, group, mode and posix acl), as well as MAC checks - b) check if mounting task would be allowed real operation on lower or + b) check if stashed credentials would be allowed real operation on lower or upper layer based on underlying filesystem permissions, again including MAC checks @@ -315,10 +329,10 @@ are copied up. On the other hand it can result in server enforced permissions (used by NFS, for example) being ignored (3). Check (b) ensures that no task gains permissions to underlying layers that -the mounting task does not have (2). This also means that it is possible +the stashed credentials do not have (2). This also means that it is possible to create setups where the consistency rule (1) does not hold; normally, -however, the mounting task will have sufficient privileges to perform all -operations. +however, the stashed credentials will have sufficient privileges to +perform all operations. Another way to demonstrate this model is drawing parallels between:: diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 1639e78e3146..767b2927c762 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1144,7 +1144,7 @@ and it *must* be opened exclusive. --- -** mandatory** +**mandatory** ->d_revalidate() gets two extra arguments - inode of parent directory and name our dentry is expected to have. Both are stable (dir is pinned in @@ -1157,3 +1157,49 @@ in normal case it points into the pathname being looked up. 
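For the ->d_revalidate() change described in the porting.rst hunk above (the NOTE that follows continues the same entry), a minimal sketch of the new callback might look as follows. The exact argument order is assumed from the description and should be checked against include/linux/dcache.h; foo_backend_still_valid() is a hypothetical helper standing in for whatever validity check the filesystem performs::

  #include <linux/dcache.h>
  #include <linux/namei.h>

  /* Hypothetical: asks the backend whether this name is still valid. */
  static bool foo_backend_still_valid(struct inode *dir,
                                      const struct qstr *name,
                                      struct dentry *dentry);

  static int foo_d_revalidate(struct inode *dir, const struct qstr *name,
                              struct dentry *dentry, unsigned int flags)
  {
          if (flags & LOOKUP_RCU)
                  return -ECHILD;         /* defer to ref-walk mode */

          /*
           * 'dir' and 'name' are the stable parent inode and expected name
           * passed in by the VFS, so the check does not need to touch
           * dentry->d_parent or dentry->d_name directly.
           */
          return foo_backend_still_valid(dir, name, dentry) ? 1 : 0;
  }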
NOTE: if you need something like full path from the root of filesystem, you are still on your own - this assists with simple cases, but it's not magic.
+
+---
+
+**recommended**
+
+kern_path_locked() and user_path_locked() no longer return a negative
+dentry so this doesn't need to be checked. If the name cannot be found,
+ERR_PTR(-ENOENT) is returned.
+
+---
+
+**recommended**
+
+lookup_one_qstr_excl() is changed to return errors in more cases, so
+these conditions don't require explicit checks:
+
+ - if LOOKUP_CREATE is NOT given, then the dentry won't be negative,
+   ERR_PTR(-ENOENT) is returned instead
+ - if LOOKUP_EXCL IS given, then the dentry won't be positive,
+   ERR_PTR(-EEXIST) is returned instead
+
+LOOKUP_EXCL now means "target must not exist". It can be combined with
+LOOKUP_CREATE or LOOKUP_RENAME_TARGET.
+
+---
+
+**mandatory**
+invalidate_inodes() is gone; use evict_inodes() instead.
+
+---
+
+**mandatory**
+
+->mkdir() now returns a dentry. If the created inode is found to
+already be in cache and have a dentry (often IS_ROOT()), it will need to
+be spliced into the given name in place of the given dentry. That dentry
+now needs to be returned. If the original dentry is used, NULL should
+be returned. Any error should be returned with ERR_PTR().
+
+In general, filesystems which use d_instantiate_new() to install the new
+inode can safely return NULL. Filesystems which may not have an I_NEW inode
+should use d_drop();d_splice_alias() and return the result of the latter.
+
+If a positive dentry cannot be returned for some reason, in-kernel
+clients such as cachefiles, nfsd, smb/server may not perform ideally but
+will fail-safe.
diff --git a/Documentation/filesystems/sysv-fs.rst b/Documentation/filesystems/sysv-fs.rst
deleted file mode 100644
index 89e40911ad7c..000000000000
--- a/Documentation/filesystems/sysv-fs.rst
+++ /dev/null
@@ -1,264 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==================
-SystemV Filesystem
-==================
-
-It implements all of
-  - Xenix FS,
-  - SystemV/386 FS,
-  - Coherent FS.
-
-To install:
-
-* Answer the 'System V and Coherent filesystem support' question with 'y'
-  when configuring the kernel.
-* To mount a disk or a partition, use::
-
-    mount [-r] -t sysv device mountpoint
-
-  The file system type names::
-
-    -t sysv
-    -t xenix
-    -t coherent
-
-  may be used interchangeably, but the last two will eventually disappear.
-
-Bugs in the present implementation:
-
-- Coherent FS:
-
-  - The "free list interleave" n:m is currently ignored.
-  - Only file systems with no filesystem name and no pack name are recognized.
-    (See Coherent "man mkfs" for a description of these features.)
-
-- SystemV Release 2 FS:
-
-  The superblock is only searched in the blocks 9, 15, 18, which
-  corresponds to the beginning of track 1 on floppy disks. No support
-  for this FS on hard disk yet.
-
-
-These filesystems are rather similar. Here is a comparison with Minix FS:
-
-* Linux fdisk reports on partitions
-
-  - Minix FS     0x81 Linux/Minix
-  - Xenix FS     ??
-  - SystemV FS   ??
-  - Coherent FS  0x08 AIX bootable
-
-* Size of a block or zone (data allocation unit on disk)
-
-  - Minix FS     1024
-  - Xenix FS     1024 (also 512 ??)
-  - SystemV FS   1024 (also 512 and 2048)
-  - Coherent FS   512
-
-* General layout: all have one boot block, one super block and
-  separate areas for inodes and for directories/data.
-  On SystemV Release 2 FS (e.g. Microport) the first track is reserved and
-  all the block numbers (including the super block) are offset by one track.
- -* Byte ordering of "short" (16 bit entities) on disk: - - - Minix FS little endian 0 1 - - Xenix FS little endian 0 1 - - SystemV FS little endian 0 1 - - Coherent FS little endian 0 1 - - Of course, this affects only the file system, not the data of files on it! - -* Byte ordering of "long" (32 bit entities) on disk: - - - Minix FS little endian 0 1 2 3 - - Xenix FS little endian 0 1 2 3 - - SystemV FS little endian 0 1 2 3 - - Coherent FS PDP-11 2 3 0 1 - - Of course, this affects only the file system, not the data of files on it! - -* Inode on disk: "short", 0 means non-existent, the root dir ino is: - - ================================= == - Minix FS 1 - Xenix FS, SystemV FS, Coherent FS 2 - ================================= == - -* Maximum number of hard links to a file: - - =========== ========= - Minix FS 250 - Xenix FS ?? - SystemV FS ?? - Coherent FS >=10000 - =========== ========= - -* Free inode management: - - - Minix FS - a bitmap - - Xenix FS, SystemV FS, Coherent FS - There is a cache of a certain number of free inodes in the super-block. - When it is exhausted, new free inodes are found using a linear search. - -* Free block management: - - - Minix FS - a bitmap - - Xenix FS, SystemV FS, Coherent FS - Free blocks are organized in a "free list". Maybe a misleading term, - since it is not true that every free block contains a pointer to - the next free block. Rather, the free blocks are organized in chunks - of limited size, and every now and then a free block contains pointers - to the free blocks pertaining to the next chunk; the first of these - contains pointers and so on. The list terminates with a "block number" - 0 on Xenix FS and SystemV FS, with a block zeroed out on Coherent FS. - -* Super-block location: - - =========== ========================== - Minix FS block 1 = bytes 1024..2047 - Xenix FS block 1 = bytes 1024..2047 - SystemV FS bytes 512..1023 - Coherent FS block 1 = bytes 512..1023 - =========== ========================== - -* Super-block layout: - - - Minix FS:: - - unsigned short s_ninodes; - unsigned short s_nzones; - unsigned short s_imap_blocks; - unsigned short s_zmap_blocks; - unsigned short s_firstdatazone; - unsigned short s_log_zone_size; - unsigned long s_max_size; - unsigned short s_magic; - - - Xenix FS, SystemV FS, Coherent FS:: - - unsigned short s_firstdatazone; - unsigned long s_nzones; - unsigned short s_fzone_count; - unsigned long s_fzones[NICFREE]; - unsigned short s_finode_count; - unsigned short s_finodes[NICINOD]; - char s_flock; - char s_ilock; - char s_modified; - char s_rdonly; - unsigned long s_time; - short s_dinfo[4]; -- SystemV FS only - unsigned long s_free_zones; - unsigned short s_free_inodes; - short s_dinfo[4]; -- Xenix FS only - unsigned short s_interleave_m,s_interleave_n; -- Coherent FS only - char s_fname[6]; - char s_fpack[6]; - - then they differ considerably: - - Xenix FS:: - - char s_clean; - char s_fill[371]; - long s_magic; - long s_type; - - SystemV FS:: - - long s_fill[12 or 14]; - long s_state; - long s_magic; - long s_type; - - Coherent FS:: - - unsigned long s_unique; - - Note that Coherent FS has no magic. 
- -* Inode layout: - - - Minix FS:: - - unsigned short i_mode; - unsigned short i_uid; - unsigned long i_size; - unsigned long i_time; - unsigned char i_gid; - unsigned char i_nlinks; - unsigned short i_zone[7+1+1]; - - - Xenix FS, SystemV FS, Coherent FS:: - - unsigned short i_mode; - unsigned short i_nlink; - unsigned short i_uid; - unsigned short i_gid; - unsigned long i_size; - unsigned char i_zone[3*(10+1+1+1)]; - unsigned long i_atime; - unsigned long i_mtime; - unsigned long i_ctime; - - -* Regular file data blocks are organized as - - - Minix FS: - - - 7 direct blocks - - 1 indirect block (pointers to blocks) - - 1 double-indirect block (pointer to pointers to blocks) - - - Xenix FS, SystemV FS, Coherent FS: - - - 10 direct blocks - - 1 indirect block (pointers to blocks) - - 1 double-indirect block (pointer to pointers to blocks) - - 1 triple-indirect block (pointer to pointers to pointers to blocks) - - - =========== ========== ================ - Inode size inodes per block - =========== ========== ================ - Minix FS 32 32 - Xenix FS 64 16 - SystemV FS 64 16 - Coherent FS 64 8 - =========== ========== ================ - -* Directory entry on disk - - - Minix FS:: - - unsigned short inode; - char name[14/30]; - - - Xenix FS, SystemV FS, Coherent FS:: - - unsigned short inode; - char name[14]; - - =========== ============== ===================== - Dir entry size dir entries per block - =========== ============== ===================== - Minix FS 16/32 64/32 - Xenix FS 16 64 - SystemV FS 16 64 - Coherent FS 16 32 - =========== ============== ===================== - -* How to implement symbolic links such that the host fsck doesn't scream: - - - Minix FS normal - - Xenix FS kludge: as regular files with chmod 1000 - - SystemV FS ?? - - Coherent FS kludge: as regular files with chmod 1000 - - -Notation: We often speak of a "block" but mean a zone (the allocation unit) -and not the disk driver's notion of "block". diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 31eea688609a..ae79c30b6c0c 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -495,7 +495,7 @@ As of kernel 2.6.22, the following members are defined: int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); - int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); + struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, @@ -562,7 +562,26 @@ otherwise noted. ``mkdir`` called by the mkdir(2) system call. Only required if you want to support creating subdirectories. You will probably need to - call d_instantiate() just as you would in the create() method + call d_instantiate_new() just as you would in the create() method. + + If d_instantiate_new() is not used and if the fh_to_dentry() + export operation is provided, or if the storage might be + accessible by another path (e.g. with a network filesystem) + then more care may be needed. Importantly d_instantate() + should not be used with an inode that is no longer I_NEW if there + any chance that the inode could already be attached to a dentry. 
+ This is because of a hard rule in the VFS that a directory must + only ever have one dentry. + + For example, if an NFS filesystem is mounted twice the new directory + could be visible on the other mount before it is on the original + mount, and a pair of name_to_handle_at(), open_by_handle_at() + calls could instantiate the directory inode with an IS_ROOT() + dentry before the first mkdir returns. + + If there is any chance this could happen, then the new inode + should be d_drop()ed and attached with d_splice_alias(). The + returned dentry (if any) should be returned by ->mkdir(). ``rmdir`` called by the rmdir(2) system call. Only required if you want diff --git a/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst index 6402ab8e370c..2a2705e975e8 100644 --- a/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst +++ b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst @@ -219,7 +219,7 @@ The log is circular, so the positions in the log are defined by the combination of a cycle number - the number of times the log has been overwritten - and the offset into the log. A LSN carries the cycle in the upper 32 bits and the offset in the lower 32 bits. The offset is in units of "basic blocks" (512 -bytes). Hence we can do realtively simple LSN based math to keep track of +bytes). Hence we can do relatively simple LSN based math to keep track of available space in the log. Log space accounting is done via a pair of constructs called "grant heads". The diff --git a/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst index 32b6ac4ca9d6..ce4584fb3103 100644 --- a/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst +++ b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst @@ -93,7 +93,7 @@ others on a regular basis about burnout. sponsoring work on any part of XFS. - **LTS Maintainer**: Someone who backports and tests bug fixes from - uptream to the LTS kernels. + upstream to the LTS kernels. There tend to be six separate LTS trees at any given time. The maintainer for a given LTS release should identify themselves with an diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index 12aa63840830..e231d127cd40 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -4521,8 +4521,8 @@ Both online and offline repair can use this strategy. | For this second effort, the ondisk parent pointer format as originally | | proposed was ``(parent_inum, parent_gen, dirent_pos) → (dirent_name)``. | | The format was changed during development to eliminate the requirement | -| of repair tools needing to to ensure that the ``dirent_pos`` field | -| always matched when reconstructing a directory. | +| of repair tools needing to ensure that the ``dirent_pos`` field always | +| matched when reconstructing a directory. | | | | There were a few other ways to have solved that problem: | | | diff --git a/Documentation/iio/iio_devbuf.rst b/Documentation/iio/iio_devbuf.rst index 9919e4792d0e..dca1f0200b0d 100644 --- a/Documentation/iio/iio_devbuf.rst +++ b/Documentation/iio/iio_devbuf.rst @@ -148,5 +148,5 @@ applied), however there are corner cases in which the buffered data may be found in a processed form. Please note that these corner cases are not addressed by this documentation. 
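Returning to the ->mkdir() change documented in the vfs.rst and porting.rst hunks above, a minimal sketch of the new contract might look like the following. It is not taken from any in-tree filesystem; foo_new_dir_inode() is a hypothetical helper that allocates and fills in the new directory inode::

  #include <linux/dcache.h>
  #include <linux/err.h>
  #include <linux/fs.h>

  /* Hypothetical: allocates and initializes the new directory inode. */
  static struct inode *foo_new_dir_inode(struct inode *dir, umode_t mode);

  static struct dentry *foo_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                                  struct dentry *dentry, umode_t mode)
  {
          struct inode *inode;

          inode = foo_new_dir_inode(dir, mode);
          if (IS_ERR(inode))
                  return ERR_CAST(inode);   /* errors go back as ERR_PTR() */

          /*
           * The inode may already have a dentry (for example when the same
           * backing store is visible through another mount), so drop the
           * passed-in dentry and let d_splice_alias() pick or attach the
           * right alias.  A NULL return means the original dentry was used.
           */
          d_drop(dentry);
          return d_splice_alias(inode, dentry);
  }

Filesystems that know the inode is freshly created (still I_NEW) can instead call d_instantiate_new() and simply return NULL, as the text above notes.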
-Please see ``Documentation/ABI/testing/sysfs-bus-iio`` for a complete +Please see Documentation/ABI/testing/sysfs-bus-iio for a complete description of the attributes. diff --git a/Documentation/input/devices/elantech.rst b/Documentation/input/devices/elantech.rst index c3374a7ce7af..98163a258b83 100644 --- a/Documentation/input/devices/elantech.rst +++ b/Documentation/input/devices/elantech.rst @@ -556,7 +556,7 @@ Note on debounce: In case the box has unstable power supply or other electricity issues, or when number of finger changes, F/W would send "debounce packet" to inform driver that the hardware is in debounce status. -The debouce packet has the following signature:: +The debounce packet has the following signature:: byte 0: 0xc4 byte 1: 0xff diff --git a/Documentation/input/input-programming.rst b/Documentation/input/input-programming.rst index c9264814c7aa..2b3e6a34e34b 100644 --- a/Documentation/input/input-programming.rst +++ b/Documentation/input/input-programming.rst @@ -346,3 +346,22 @@ driver can handle these events, it has to set the respective bits in evbit, This callback routine can be called from an interrupt or a BH (although that isn't a rule), and thus must not sleep, and must not take too long to finish. + +Polled input devices +~~~~~~~~~~~~~~~~~~~~ + +Input polling is set up by passing an input device struct and a callback to +the function:: + + int input_setup_polling(struct input_dev *dev, + void (*poll_fn)(struct input_dev *dev)) + +Within the callback, devices should use the regular input_report_* functions +and input_sync as is used by other devices. + +There is also the function:: + + void input_set_poll_interval(struct input_dev *dev, unsigned int interval) + +which is used to configure the interval, in milliseconds, that the device will +be polled at. diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 8e1ceb0a6619..cc3cd46abd1b 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -4,7 +4,7 @@ Split page table lock Originally, mm->page_table_lock spinlock protected all page tables of the mm_struct. But this approach leads to poor page fault scalability of -multi-threaded applications due high contention on the lock. To improve +multi-threaded applications due to high contention on the lock. To improve scalability, split page table lock was introduced. With split page table lock we have separate per-table lock to serialize diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst index 75e017dfa825..518284e287b0 100644 --- a/Documentation/networking/statistics.rst +++ b/Documentation/networking/statistics.rst @@ -143,7 +143,7 @@ reading multiple stats as it internally performs a full dump of and reports only the stat corresponding to the accessed file. Sysfs files are documented in -`Documentation/ABI/testing/sysfs-class-net-statistics`. +Documentation/ABI/testing/sysfs-class-net-statistics. netlink diff --git a/Documentation/nvme/nvme-pci-endpoint-target.rst b/Documentation/nvme/nvme-pci-endpoint-target.rst index 66e7b7d869b4..b699595d1762 100644 --- a/Documentation/nvme/nvme-pci-endpoint-target.rst +++ b/Documentation/nvme/nvme-pci-endpoint-target.rst @@ -86,7 +86,7 @@ configurable through configfs before starting the controller. To avoid issues with excessive local memory usage for executing commands, MDTS defaults to 512 KB and is limited to a maximum of 2 MB (arbitrary limit). 
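The polled-input interface added to input-programming.rst above can be exercised with a small driver along these lines. This is a sketch only: read_hw_button() stands in for whatever hardware access a real driver would perform, and the 100 ms interval is arbitrary::

  #include <linux/input.h>
  #include <linux/module.h>

  static struct input_dev *button_dev;

  static bool read_hw_button(void)            /* hypothetical hardware access */
  {
          return false;
  }

  static void button_poll(struct input_dev *dev)
  {
          /* Report the current state and flush it, as any input driver would. */
          input_report_key(dev, KEY_POWER, read_hw_button());
          input_sync(dev);
  }

  static int __init button_init(void)
  {
          int error;

          button_dev = input_allocate_device();
          if (!button_dev)
                  return -ENOMEM;

          button_dev->name = "Example polled button";
          __set_bit(EV_KEY, button_dev->evbit);
          __set_bit(KEY_POWER, button_dev->keybit);

          /* Have the input core call button_poll() every 100 ms. */
          error = input_setup_polling(button_dev, button_poll);
          if (error)
                  goto err_free;
          input_set_poll_interval(button_dev, 100);

          error = input_register_device(button_dev);
          if (error)
                  goto err_free;
          return 0;

  err_free:
          input_free_device(button_dev);
          return error;
  }

  static void __exit button_exit(void)
  {
          input_unregister_device(button_dev);
  }

  module_init(button_init);
  module_exit(button_exit);
  MODULE_LICENSE("GPL");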
-Mimimum number of PCI Address Mapping Windows Required +Minimum number of PCI Address Mapping Windows Required ------------------------------------------------------ Most PCI endpoint controllers provide a limited number of mapping windows for diff --git a/Documentation/process/5.Posting.rst b/Documentation/process/5.Posting.rst index dbb763a8de90..22fa925353cf 100644 --- a/Documentation/process/5.Posting.rst +++ b/Documentation/process/5.Posting.rst @@ -268,10 +268,15 @@ The tags in common use are: - Cc: the named person received a copy of the patch and had the opportunity to comment on it. -Be careful in the addition of tags to your patches, as only Cc: is appropriate -for addition without the explicit permission of the person named; using -Reported-by: is fine most of the time as well, but ask for permission if -the bug was reported in private. +Be careful in the addition of the aforementioned tags to your patches, as all +except for Cc:, Reported-by:, and Suggested-by: need explicit permission of the +person named. For those three implicit permission is sufficient if the person +contributed to the Linux kernel using that name and email address according +to the lore archives or the commit history -- and in case of Reported-by: +and Suggested-by: did the reporting or suggestion in public. Note, +bugzilla.kernel.org is a public place in this sense, but email addresses +used there are private; so do not expose them in tags, unless the person +used them in earlier contributions. Sending the patch diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index a0beca805362..d564362773b5 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -58,11 +58,11 @@ mcelog 0.6 mcelog --version iptables 1.4.2 iptables -V openssl & libcrypto 1.0.0 openssl version bc 1.06.95 bc --version -Sphinx\ [#f1]_ 2.4.4 sphinx-build --version +Sphinx\ [#f1]_ 3.4.3 sphinx-build --version GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version mkimage (optional) 2017.01 mkimage --version -Python (optional) 3.5.x python3 --version +Python (optional) 3.9.x python3 --version GNU AWK (optional) 5.1.0 gawk --version ====================== =============== ======================================== diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst index 1d1150954be3..4cdef8360698 100644 --- a/Documentation/process/code-of-conduct-interpretation.rst +++ b/Documentation/process/code-of-conduct-interpretation.rst @@ -145,13 +145,16 @@ kernel community. Any decisions regarding enforcement recommendations will be brought to the TAB for implementation of enforcement with the relevant maintainers -if needed. A decision by the Code of Conduct Committee can be overturned -by the TAB by a two-thirds vote. +if needed. Once the TAB approves one or more of the measures outlined +in the scope of the ban by two-thirds of the members voting for the +measures, the Code of Conduct Committee will enforce the TAB approved +measures. Any Code of Conduct Committee members serving on the TAB will +not vote on the measures. At quarterly intervals, the Code of Conduct Committee and TAB will provide a report summarizing the anonymised reports that the Code of Conduct committee has received and their status, as well details of any -overridden decisions including complete and identifiable voting details. +TAB approved decisions including complete and identifiable voting details. 
Because how we interpret and enforce the Code of Conduct will evolve over time, this document will be updated when necessary to reflect any @@ -227,9 +230,11 @@ The scope of the ban for a period of time could include: such as mailing lists and social media sites Once the TAB approves one or more of the measures outlined in the scope of -the ban by a two-thirds vote, the Code of Conduct Committee will enforce -the TAB approved measure(s) in collaboration with the community, maintainers, -sub-maintainers, and kernel.org administrators. +the ban by two-thirds of the members voting for the measures, the Code of +Conduct Committee will enforce the TAB approved measure(s) in collaboration +with the community, maintainers, sub-maintainers, and kernel.org +administrators. Any Code of Conduct Committee members serving on the TAB +will not vote on the measures. The Code of Conduct Committee is mindful of the negative impact of seeking public apology and instituting ban could have on individuals. It is also diff --git a/Documentation/process/kernel-docs.rst b/Documentation/process/kernel-docs.rst index 3b5b5983fea8..c67ac12cf789 100644 --- a/Documentation/process/kernel-docs.rst +++ b/Documentation/process/kernel-docs.rst @@ -75,6 +75,17 @@ On-line docs Published books --------------- + * Title: **The Linux Memory Manager** + + :Author: Lorenzo Stoakes + :Publisher: No Starch Press + :Date: February 2025 + :Pages: 1300 + :ISBN: 978-1718504462 + :Notes: Memory management. Full draft available as early access for + pre-order, full release scheduled for Fall 2025. See + https://nostarch.com/linux-memory-manager for further info. + * Title: **Practical Linux System Administration: A Guide to Installation, Configuration, and Management, 1st Edition** :Author: Kenneth Hess diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index e531dd504b6c..beb7f94279fd 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -52,7 +52,8 @@ Provide documentation 4) All new module parameters are documented with ``MODULE_PARM_DESC()`` 5) All new userspace interfaces are documented in ``Documentation/ABI/``. - See ``Documentation/ABI/README`` for more information. + See Documentation/admin-guide/abi.rst (or ``Documentation/ABI/README``) + for more information. Patches that change userspace interfaces should be CCed to linux-api@vger.kernel.org. @@ -91,9 +92,12 @@ Build your code fix any issues. 2) Builds on multiple CPU architectures by using local cross-compile tools - or some other build farm. Note that ppc64 is a good architecture for - cross-compilation checking because it tends to use ``unsigned long`` for - 64-bit quantities. + or some other build farm. + Note that testing against architectures of different word sizes + (32- and 64-bit) and different endianness (big- and little-) is effective + in catching various portability issues due to false assumptions on + representable quantity range, data alignment, or endianness, among + others. 3) Newly-added code has been compiled with ``gcc -W`` (use ``make KCFLAGS=-W``). This will generate lots of noise, but is good diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 8fdc0ef3e604..cede4e7b29af 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -495,10 +495,10 @@ list archives. A "# Suffix" may also be used in this case to clarify. 
If a person has had the opportunity to comment on a patch, but has not provided such comments, you may optionally add a ``Cc:`` tag to the patch. -This is the only tag which might be added without an explicit action by the -person it names - but it should indicate that this person was copied on the -patch. This tag documents that potentially interested parties -have been included in the discussion. +This tag documents that potentially interested parties have been included in +the discussion. Note, this is one of only three tags you might be able to use +without explicit permission of the person named (see 'Tagging people requires +permission' below for details). Co-developed-by: states that the patch was co-created by multiple developers; it is used to give attribution to co-authors (in addition to the author @@ -544,9 +544,9 @@ hopefully inspires them to help us again in the future. The tag is intended for bugs; please do not use it to credit feature requests. The tag should be followed by a Closes: tag pointing to the report, unless the report is not available on the web. The Link: tag can be used instead of Closes: if the patch -fixes a part of the issue(s) being reported. Please note that if the bug was -reported in private, then ask for permission first before using the Reported-by -tag. +fixes a part of the issue(s) being reported. Note, the Reported-by tag is one +of only three tags you might be able to use without explicit permission of the +person named (see 'Tagging people requires permission' below for details). A Tested-by: tag indicates that the patch has been successfully tested (in some environment) by the person named. This tag informs maintainers that @@ -596,11 +596,11 @@ Usually removal of someone's Tested-by or Reviewed-by tags should be mentioned in the patch changelog (after the '---' separator). A Suggested-by: tag indicates that the patch idea is suggested by the person -named and ensures credit to the person for the idea. Please note that this -tag should not be added without the reporter's permission, especially if the -idea was not posted in a public forum. That said, if we diligently credit our -idea reporters, they will, hopefully, be inspired to help us again in the -future. +named and ensures credit to the person for the idea: if we diligently credit +our idea reporters, they will, hopefully, be inspired to help us again in the +future. Note, this is one of only three tags you might be able to use without +explicit permission of the person named (see 'Tagging people requires +permission' below for details). A Fixes: tag indicates that the patch fixes an issue in a previous commit. It is used to make it easy to determine where a bug originated, which can help @@ -618,6 +618,21 @@ Finally, while providing tags is welcome and typically very appreciated, please note that signers (i.e. submitters and maintainers) may use their discretion in applying offered tags. +.. _tagging_people: + +Tagging people requires permission +---------------------------------- + +Be careful in the addition of the aforementioned tags to your patches, as all +except for Cc:, Reported-by:, and Suggested-by: need explicit permission of the +person named. For those three implicit permission is sufficient if the person +contributed to the Linux kernel using that name and email address according +to the lore archives or the commit history -- and in case of Reported-by: +and Suggested-by: did the reporting or suggestion in public. 
Note, +bugzilla.kernel.org is a public place in this sense, but email addresses +used there are private; so do not expose them in tags, unless the person +used them in earlier contributions. + .. _the_canonical_patch_format: The canonical patch format @@ -717,6 +732,12 @@ patch in the permanent changelog. If the ``from`` line is missing, then the ``From:`` line from the email header will be used to determine the patch author in the changelog. +The author may indicate their affiliation or the sponsor of the work +by adding the name of an organization to the ``from`` and ``SoB`` lines, +e.g.: + + From: Patch Author (Company) <author@example.com> + Explanation Body ^^^^^^^^^^^^^^^^ diff --git a/Documentation/scheduler/sched-bwc.rst b/Documentation/scheduler/sched-bwc.rst index 41ed2ceafc92..e881a945c188 100644 --- a/Documentation/scheduler/sched-bwc.rst +++ b/Documentation/scheduler/sched-bwc.rst @@ -59,7 +59,7 @@ At the same time, we can say that the worst case deadline miss, will be \Sum e_i; that is, there is a bounded tardiness (under the assumption that x+e is indeed WCET). -The interferenece when using burst is valued by the possibilities for +The interference when using burst is valued by the possibilities for missing the deadline and the average WCET. Test results showed that when there many cgroups or CPU is under utilized, the interference is limited. More details are shown in: diff --git a/Documentation/scheduler/sched-debug.rst b/Documentation/scheduler/sched-debug.rst index 4d3d24f2a439..b5a92a39eccd 100644 --- a/Documentation/scheduler/sched-debug.rst +++ b/Documentation/scheduler/sched-debug.rst @@ -2,7 +2,7 @@ Scheduler debugfs ================= -Booting a kernel with CONFIG_SCHED_DEBUG=y will give access to +Booting a kernel with debugfs enabled will give access to scheduler specific debug files under /sys/kernel/debug/sched. Some of those files are described below. diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index 8786f219fc73..b574a2644c77 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -96,7 +96,7 @@ picked and the current task is preempted. CFS uses nanosecond granularity accounting and does not rely on any jiffies or other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the way the previous scheduler had, and has no heuristics whatsoever. There is -only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): +only one central tunable: /sys/kernel/debug/sched/base_slice_ns diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 5e996fe973b1..15e3a4cb304a 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -73,9 +73,8 @@ Architectures may override the generic domain builder and the default SD flags for a given topology level by creating a sched_domain_topology_level array and calling set_sched_topology() with this array as the parameter. -The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you -forgot to tweak your cmdline, you can also flip the +The sched-domains debugging infrastructure can be enabled by 'sched_verbose' +to your cmdline. If you forgot to tweak your cmdline, you can also flip the /sys/kernel/debug/sched/verbose knob. 
This enables an error checking parse of the sched domains which should catch most possible errors (described above). It also prints out the domain structure in a visual format. diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index c4672d7df2f7..0b2654e2164b 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -107,8 +107,7 @@ detailed information: nr_rejected : 0 enable_seq : 1 -If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can -be determined as follows: +Whether a given task is on sched_ext can be determined as follows: .. code-block:: none @@ -294,6 +293,42 @@ dispatching, and must be dispatched to with ``scx_bpf_dsq_insert()``. See the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for more information. +Task Lifecycle +-------------- + +The following pseudo-code summarizes the entire lifecycle of a task managed +by a sched_ext scheduler: + +.. code-block:: c + + ops.init_task(); /* A new task is created */ + ops.enable(); /* Enable BPF scheduling for the task */ + + while (task in SCHED_EXT) { + if (task can migrate) + ops.select_cpu(); /* Called on wakeup (optimization) */ + + ops.runnable(); /* Task becomes ready to run */ + + while (task is runnable) { + if (task is not in a DSQ) { + ops.enqueue(); /* Task can be added to a DSQ */ + + /* A CPU becomes available */ + + ops.dispatch(); /* Task is moved to a local DSQ */ + } + ops.running(); /* Task starts running on its assigned CPU */ + ops.tick(); /* Called every 1/HZ seconds */ + ops.stopping(); /* Task stops running (time slice expires or wait) */ + } + + ops.quiescent(); /* Task releases its assigned CPU (wait) */ + } + + ops.disable(); /* Disable BPF scheduling for the task */ + ops.exit_task(); /* Task is destroyed */ + Where to Look ============= diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst index caea83d91c67..08b6bc9a315c 100644 --- a/Documentation/scheduler/sched-stats.rst +++ b/Documentation/scheduler/sched-stats.rst @@ -88,7 +88,7 @@ One of these is produced per domain for each cpu described. (Note that if CONFIG_SMP is not defined, *no* domains are utilized and these lines will not appear in the output. <name> is an extension to the domain field that prints the name of the corresponding sched domain. It can appear in -schedstat version 17 and above, and requires CONFIG_SCHED_DEBUG.) +schedstat version 17 and above. domain<N> <name> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 diff --git a/Documentation/sound/soc/machine.rst b/Documentation/sound/soc/machine.rst index 9db132bc0070..1828f5edca3e 100644 --- a/Documentation/sound/soc/machine.rst +++ b/Documentation/sound/soc/machine.rst @@ -75,7 +75,7 @@ In the above struct, dai’s are registered using names but you can pass either dai name or device tree node but not both. Also, names used here for cpu/codec/platform dais should be globally unique. 
-Additionaly below example macro can be used to register cpu, codec and +Additionally below example macro can be used to register cpu, codec and platform dai:: SND_SOC_DAILINK_DEFS(wm2200_cpu_dsp, diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py index a413f8dd5115..ecf54d22e9dc 100644 --- a/Documentation/sphinx/automarkup.py +++ b/Documentation/sphinx/automarkup.py @@ -11,13 +11,7 @@ from sphinx.errors import NoUri import re from itertools import chain -# -# Python 2 lacks re.ASCII... -# -try: - ascii_p3 = re.ASCII -except AttributeError: - ascii_p3 = 0 +from kernel_abi import get_kernel_abi # # Regex nastiness. Of course. @@ -26,28 +20,30 @@ except AttributeError: # :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last # bit tries to restrict matches to things that won't create trouble. # -RE_function = re.compile(r'\b(([a-zA-Z_]\w+)\(\))', flags=ascii_p3) +RE_function = re.compile(r'\b(([a-zA-Z_]\w+)\(\))', flags=re.ASCII) # # Sphinx 2 uses the same :c:type role for struct, union, enum and typedef # RE_generic_type = re.compile(r'\b(struct|union|enum|typedef)\s+([a-zA-Z_]\w+)', - flags=ascii_p3) + flags=re.ASCII) # # Sphinx 3 uses a different C role for each one of struct, union, enum and # typedef # -RE_struct = re.compile(r'\b(struct)\s+([a-zA-Z_]\w+)', flags=ascii_p3) -RE_union = re.compile(r'\b(union)\s+([a-zA-Z_]\w+)', flags=ascii_p3) -RE_enum = re.compile(r'\b(enum)\s+([a-zA-Z_]\w+)', flags=ascii_p3) -RE_typedef = re.compile(r'\b(typedef)\s+([a-zA-Z_]\w+)', flags=ascii_p3) +RE_struct = re.compile(r'\b(struct)\s+([a-zA-Z_]\w+)', flags=re.ASCII) +RE_union = re.compile(r'\b(union)\s+([a-zA-Z_]\w+)', flags=re.ASCII) +RE_enum = re.compile(r'\b(enum)\s+([a-zA-Z_]\w+)', flags=re.ASCII) +RE_typedef = re.compile(r'\b(typedef)\s+([a-zA-Z_]\w+)', flags=re.ASCII) # # Detects a reference to a documentation page of the form Documentation/... 
with # an optional extension # RE_doc = re.compile(r'(\bDocumentation/)?((\.\./)*[\w\-/]+)\.(rst|txt)') +RE_abi_file = re.compile(r'(\bDocumentation/ABI/[\w\-/]+)') +RE_abi_symbol = re.compile(r'(\b/(sys|config|proc)/[\w\-/]+)') RE_namespace = re.compile(r'^\s*..\s*c:namespace::\s*(\S+)\s*$') @@ -83,11 +79,10 @@ def markup_refs(docname, app, node): # # Associate each regex with the function that will markup its matches # - markup_func_sphinx2 = {RE_doc: markup_doc_ref, - RE_function: markup_c_ref, - RE_generic_type: markup_c_ref} - markup_func_sphinx3 = {RE_doc: markup_doc_ref, + markup_func = {RE_doc: markup_doc_ref, + RE_abi_file: markup_abi_file_ref, + RE_abi_symbol: markup_abi_ref, RE_function: markup_func_ref_sphinx3, RE_struct: markup_c_ref, RE_union: markup_c_ref, @@ -95,11 +90,6 @@ def markup_refs(docname, app, node): RE_typedef: markup_c_ref, RE_git: markup_git} - if sphinx.version_info[0] >= 3: - markup_func = markup_func_sphinx3 - else: - markup_func = markup_func_sphinx2 - match_iterators = [regex.finditer(t) for regex in markup_func] # # Sort all references by the starting position in text @@ -270,6 +260,54 @@ def markup_doc_ref(docname, app, match): else: return nodes.Text(match.group(0)) +# +# Try to replace a documentation reference for ABI symbols and files +# with a cross reference to that page +# +def markup_abi_ref(docname, app, match, warning=False): + stddom = app.env.domains['std'] + # + # Go through the dance of getting an xref out of the std domain + # + kernel_abi = get_kernel_abi() + + fname = match.group(1) + target = kernel_abi.xref(fname) + + # Kernel ABI doesn't describe such file or symbol + if not target: + if warning: + kernel_abi.log.warning("%s not found", fname) + return nodes.Text(match.group(0)) + + pxref = addnodes.pending_xref('', refdomain = 'std', reftype = 'ref', + reftarget = target, modname = None, + classname = None, refexplicit = False) + + # + # XXX The Latex builder will throw NoUri exceptions here, + # work around that by ignoring them. + # + try: + xref = stddom.resolve_xref(app.env, docname, app.builder, 'ref', + target, pxref, None) + except NoUri: + xref = None + # + # Return the xref if we got it; otherwise just return the plain text. + # + if xref: + return xref + else: + return nodes.Text(match.group(0)) + +# +# Variant of markup_abi_ref() that warns whan a reference is not found +# +def markup_abi_file_ref(docname, app, match): + return markup_abi_ref(docname, app, match, warning=True) + + def get_c_namespace(app, docname): source = app.env.doc2path(docname) with open(source) as f: diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py index e6959af25402..e8ea80d4324c 100644 --- a/Documentation/sphinx/cdomain.py +++ b/Documentation/sphinx/cdomain.py @@ -1,6 +1,6 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=W0141,C0113,C0103,C0325 -u""" +""" cdomain ~~~~~~~ @@ -45,9 +45,6 @@ import re __version__ = '1.1' -# Get Sphinx version -major, minor, patch = sphinx.version_info[:3] - # Namespace to be prepended to the full name namespace = None @@ -145,7 +142,7 @@ class CObject(Base_CObject): } def handle_func_like_macro(self, sig, signode): - u"""Handles signatures of function-like macros. + """Handles signatures of function-like macros. If the objtype is 'function' and the signature ``sig`` is a function-like macro, the name of the macro is returned. 
Otherwise diff --git a/Documentation/sphinx/kernel_abi.py b/Documentation/sphinx/kernel_abi.py index 5911bd0d7965..db6f0380de94 100644 --- a/Documentation/sphinx/kernel_abi.py +++ b/Documentation/sphinx/kernel_abi.py @@ -2,7 +2,7 @@ # coding=utf-8 # SPDX-License-Identifier: GPL-2.0 # -u""" +""" kernel-abi ~~~~~~~~~~ @@ -14,7 +14,7 @@ u""" :license: GPL Version 2, June 1991 see Linux/COPYING for details. The ``kernel-abi`` (:py:class:`KernelCmd`) directive calls the - scripts/get_abi.pl script to parse the Kernel ABI files. + scripts/get_abi.py script to parse the Kernel ABI files. Overview of directive's argument and options. @@ -32,107 +32,137 @@ u""" """ -import codecs import os -import subprocess -import sys import re -import kernellog +import sys from docutils import nodes, statemachine from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive -from docutils.utils.error_reporting import ErrorString from sphinx.util.docutils import switch_source_input +from sphinx.util import logging + +srctree = os.path.abspath(os.environ["srctree"]) +sys.path.insert(0, os.path.join(srctree, "scripts/lib/abi")) + +from abi_parser import AbiParser + +__version__ = "1.0" + +logger = logging.getLogger('kernel_abi') +path = os.path.join(srctree, "Documentation/ABI") -__version__ = '1.0' +_kernel_abi = None + +def get_kernel_abi(): + """ + Initialize kernel_abi global var, if not initialized yet. + + This is needed to avoid warnings during Sphinx module initialization. + """ + global _kernel_abi + + if not _kernel_abi: + # Parse ABI symbols only once + _kernel_abi = AbiParser(path, logger=logger) + _kernel_abi.parse_abi() + _kernel_abi.check_issues() + + return _kernel_abi def setup(app): app.add_directive("kernel-abi", KernelCmd) - return dict( - version = __version__ - , parallel_read_safe = True - , parallel_write_safe = True - ) + return { + "version": __version__, + "parallel_read_safe": True, + "parallel_write_safe": True + } -class KernelCmd(Directive): - u"""KernelABI (``kernel-abi``) directive""" +class KernelCmd(Directive): + """KernelABI (``kernel-abi``) directive""" required_arguments = 1 - optional_arguments = 2 + optional_arguments = 3 has_content = False final_argument_whitespace = True + parser = None option_spec = { - "debug" : directives.flag, - "rst" : directives.unchanged + "debug": directives.flag, + "no-symbols": directives.flag, + "no-files": directives.flag, } def run(self): + kernel_abi = get_kernel_abi() + doc = self.state.document if not doc.settings.file_insertion_enabled: raise self.warning("docutils: file insertion disabled") - srctree = os.path.abspath(os.environ["srctree"]) - - args = [ - os.path.join(srctree, 'scripts/get_abi.pl'), - 'rest', - '--enable-lineno', - '--dir', os.path.join(srctree, 'Documentation', self.arguments[0]), - ] - - if 'rst' in self.options: - args.append('--rst-source') - - lines = subprocess.check_output(args, cwd=os.path.dirname(doc.current_source)).decode('utf-8') - nodeList = self.nestedParse(lines, self.arguments[0]) - return nodeList - - def nestedParse(self, lines, fname): env = self.state.document.settings.env content = ViewList() node = nodes.section() - if "debug" in self.options: - code_block = "\n\n.. code-block:: rst\n :linenos:\n" - for l in lines.split("\n"): - code_block += "\n " + l - lines = code_block + "\n\n" + abi_type = self.arguments[0] - line_regex = re.compile(r"^\.\. 
LINENO (\S+)\#([0-9]+)$") - ln = 0 - n = 0 - f = fname + if "no-symbols" in self.options: + show_symbols = False + else: + show_symbols = True - for line in lines.split("\n"): - n = n + 1 - match = line_regex.search(line) - if match: - new_f = match.group(1) + if "no-files" in self.options: + show_file = False + else: + show_file = True - # Sphinx parser is lazy: it stops parsing contents in the - # middle, if it is too big. So, handle it per input file - if new_f != f and content: - self.do_parse(content, node) - content = ViewList() + tab_width = self.options.get('tab-width', + self.state.document.settings.tab_width) - # Add the file to Sphinx build dependencies - env.note_dependency(os.path.abspath(f)) - - f = new_f - - # sphinx counts lines from 0 - ln = int(match.group(2)) - 1 + old_f = None + n = 0 + n_sym = 0 + for msg, f, ln in kernel_abi.doc(show_file=show_file, + show_symbols=show_symbols, + filter_path=abi_type): + n_sym += 1 + msg_list = statemachine.string2lines(msg, tab_width, + convert_whitespace=True) + if "debug" in self.options: + lines = [ + "", "", ".. code-block:: rst", + " :linenos:", "" + ] + for m in msg_list: + lines.append(" " + m) else: - content.append(line, f, ln) - - kernellog.info(self.state.document.settings.env.app, "%s: parsed %i lines" % (fname, n)) + lines = msg_list - if content: - self.do_parse(content, node) + for line in lines: + # sphinx counts lines from 0 + content.append(line, f, ln - 1) + n += 1 + + if f != old_f: + # Add the file to Sphinx build dependencies + env.note_dependency(os.path.abspath(f)) + + old_f = f + + # Sphinx doesn't like to parse big messages. So, let's + # add content symbol by symbol + if content: + self.do_parse(content, node) + content = ViewList() + + if show_symbols and not show_file: + logger.verbose("%s ABI: %i symbols (%i ReST lines)" % (abi_type, n_sym, n)) + elif not show_symbols and show_file: + logger.verbose("%s ABI: %i files (%i ReST lines)" % (abi_type, n_sym, n)) + else: + logger.verbose("%s ABI: %i data (%i ReST lines)" % (abi_type, n_sym, n)) return node.children diff --git a/Documentation/sphinx/kernel_feat.py b/Documentation/sphinx/kernel_feat.py index 03ace5f01b5c..e3a51867f27b 100644 --- a/Documentation/sphinx/kernel_feat.py +++ b/Documentation/sphinx/kernel_feat.py @@ -1,7 +1,7 @@ # coding=utf-8 # SPDX-License-Identifier: GPL-2.0 # -u""" +""" kernel-feat ~~~~~~~~~~~ @@ -56,7 +56,7 @@ def setup(app): class KernelFeat(Directive): - u"""KernelFeat (``kernel-feat``) directive""" + """KernelFeat (``kernel-feat``) directive""" required_arguments = 1 optional_arguments = 2 diff --git a/Documentation/sphinx/kernel_include.py b/Documentation/sphinx/kernel_include.py index 638762442336..8db176045bc5 100755 --- a/Documentation/sphinx/kernel_include.py +++ b/Documentation/sphinx/kernel_include.py @@ -2,7 +2,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=R0903, C0330, R0914, R0912, E0401 -u""" +""" kernel-include ~~~~~~~~~~~~~~ @@ -56,7 +56,7 @@ def setup(app): class KernelInclude(Include): # ============================================================================== - u"""KernelInclude (``kernel-include``) directive""" + """KernelInclude (``kernel-include``) directive""" def run(self): env = self.state.document.settings.env diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py index ec1ddfff1863..39ddae6ae7dd 100644 --- a/Documentation/sphinx/kerneldoc.py +++ b/Documentation/sphinx/kerneldoc.py @@ -39,7 +39,7 @@ from docutils.statemachine import ViewList from 
docutils.parsers.rst import directives, Directive import sphinx from sphinx.util.docutils import switch_source_input -import kernellog +from sphinx.util import logging __version__ = '1.0' @@ -56,16 +56,12 @@ class KernelDocDirective(Directive): 'functions': directives.unchanged, } has_content = False + logger = logging.getLogger('kerneldoc') def run(self): env = self.state.document.settings.env cmd = [env.config.kerneldoc_bin, '-rst', '-enable-lineno'] - # Pass the version string to kernel-doc, as it needs to use a different - # dialect, depending what the C domain supports for each specific - # Sphinx versions - cmd += ['-sphinx-version', sphinx.__version__] - filename = env.config.kerneldoc_srctree + '/' + self.arguments[0] export_file_patterns = [] @@ -109,8 +105,7 @@ class KernelDocDirective(Directive): cmd += [filename] try: - kernellog.verbose(env.app, - 'calling kernel-doc \'%s\'' % (" ".join(cmd))) + self.logger.verbose("calling kernel-doc '%s'" % (" ".join(cmd))) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() @@ -120,8 +115,8 @@ class KernelDocDirective(Directive): if p.returncode != 0: sys.stderr.write(err) - kernellog.warn(env.app, - 'kernel-doc \'%s\' failed with return code %d' % (" ".join(cmd), p.returncode)) + self.logger.warning("kernel-doc '%s' failed with return code %d" + % (" ".join(cmd), p.returncode)) return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))] elif env.config.kerneldoc_verbosity > 0: sys.stderr.write(err) @@ -148,8 +143,8 @@ class KernelDocDirective(Directive): return node.children except Exception as e: # pylint: disable=W0703 - kernellog.warn(env.app, 'kernel-doc \'%s\' processing failed with: %s' % - (" ".join(cmd), str(e))) + self.logger.warning("kernel-doc '%s' processing failed with: %s" % + (" ".join(cmd), str(e))) return [nodes.error(None, nodes.paragraph(text = "kernel-doc missing"))] def do_parse(self, result, node): diff --git a/Documentation/sphinx/kernellog.py b/Documentation/sphinx/kernellog.py deleted file mode 100644 index 0bc00c138cad..000000000000 --- a/Documentation/sphinx/kernellog.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Sphinx has deprecated its older logging interface, but the replacement -# only goes back to 1.6. So here's a wrapper layer to keep around for -# as long as we support 1.4. 
-# -# We don't support 1.4 anymore, but we'll keep the wrappers around until -# we change all the code to not use them anymore :) -# -import sphinx -from sphinx.util import logging - -logger = logging.getLogger('kerneldoc') - -def warn(app, message): - logger.warning(message) - -def verbose(app, message): - logger.verbose(message) - -def info(app, message): - logger.info(message) diff --git a/Documentation/sphinx/kfigure.py b/Documentation/sphinx/kfigure.py index 97166333b727..f1a7f13c9c60 100644 --- a/Documentation/sphinx/kfigure.py +++ b/Documentation/sphinx/kfigure.py @@ -1,6 +1,6 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=C0103, R0903, R0912, R0915 -u""" +""" scalable figure and image handling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,12 +59,14 @@ from docutils.parsers.rst import directives from docutils.parsers.rst.directives import images import sphinx from sphinx.util.nodes import clean_astext -import kernellog +from sphinx.util import logging Figure = images.Figure __version__ = '1.0.0' +logger = logging.getLogger('kfigure') + # simple helper # ------------- @@ -163,14 +165,14 @@ def setup(app): def setupTools(app): - u""" + """ Check available build tools and log some *verbose* messages. This function is called once, when the builder is initiated. """ global dot_cmd, dot_Tpdf, convert_cmd, rsvg_convert_cmd # pylint: disable=W0603 global inkscape_cmd, inkscape_ver_one # pylint: disable=W0603 - kernellog.verbose(app, "kfigure: check installed tools ...") + logger.verbose("kfigure: check installed tools ...") dot_cmd = which('dot') convert_cmd = which('convert') @@ -178,7 +180,7 @@ def setupTools(app): inkscape_cmd = which('inkscape') if dot_cmd: - kernellog.verbose(app, "use dot(1) from: " + dot_cmd) + logger.verbose("use dot(1) from: " + dot_cmd) try: dot_Thelp_list = subprocess.check_output([dot_cmd, '-Thelp'], @@ -190,10 +192,11 @@ def setupTools(app): dot_Tpdf_ptn = b'pdf' dot_Tpdf = re.search(dot_Tpdf_ptn, dot_Thelp_list) else: - kernellog.warn(app, "dot(1) not found, for better output quality install " - "graphviz from https://www.graphviz.org") + logger.warning( + "dot(1) not found, for better output quality install graphviz from https://www.graphviz.org" + ) if inkscape_cmd: - kernellog.verbose(app, "use inkscape(1) from: " + inkscape_cmd) + logger.verbose("use inkscape(1) from: " + inkscape_cmd) inkscape_ver = subprocess.check_output([inkscape_cmd, '--version'], stderr=subprocess.DEVNULL) ver_one_ptn = b'Inkscape 1' @@ -204,26 +207,27 @@ def setupTools(app): else: if convert_cmd: - kernellog.verbose(app, "use convert(1) from: " + convert_cmd) + logger.verbose("use convert(1) from: " + convert_cmd) else: - kernellog.verbose(app, + logger.verbose( "Neither inkscape(1) nor convert(1) found.\n" - "For SVG to PDF conversion, " - "install either Inkscape (https://inkscape.org/) (preferred) or\n" - "ImageMagick (https://www.imagemagick.org)") + "For SVG to PDF conversion, install either Inkscape (https://inkscape.org/) (preferred) or\n" + "ImageMagick (https://www.imagemagick.org)" + ) if rsvg_convert_cmd: - kernellog.verbose(app, "use rsvg-convert(1) from: " + rsvg_convert_cmd) - kernellog.verbose(app, "use 'dot -Tsvg' and rsvg-convert(1) for DOT -> PDF conversion") + logger.verbose("use rsvg-convert(1) from: " + rsvg_convert_cmd) + logger.verbose("use 'dot -Tsvg' and rsvg-convert(1) for DOT -> PDF conversion") dot_Tpdf = False else: - kernellog.verbose(app, + logger.verbose( "rsvg-convert(1) not found.\n" - " SVG rendering of convert(1) is done by 
ImageMagick-native renderer.") + " SVG rendering of convert(1) is done by ImageMagick-native renderer." + ) if dot_Tpdf: - kernellog.verbose(app, "use 'dot -Tpdf' for DOT -> PDF conversion") + logger.verbose("use 'dot -Tpdf' for DOT -> PDF conversion") else: - kernellog.verbose(app, "use 'dot -Tsvg' and convert(1) for DOT -> PDF conversion") + logger.verbose("use 'dot -Tsvg' and convert(1) for DOT -> PDF conversion") # integrate conversion tools @@ -257,13 +261,12 @@ def convert_image(img_node, translator, src_fname=None): # in kernel builds, use 'make SPHINXOPTS=-v' to see verbose messages - kernellog.verbose(app, 'assert best format for: ' + img_node['uri']) + logger.verbose('assert best format for: ' + img_node['uri']) if in_ext == '.dot': if not dot_cmd: - kernellog.verbose(app, - "dot from graphviz not available / include DOT raw.") + logger.verbose("dot from graphviz not available / include DOT raw.") img_node.replace_self(file2literal(src_fname)) elif translator.builder.format == 'latex': @@ -290,10 +293,11 @@ def convert_image(img_node, translator, src_fname=None): if translator.builder.format == 'latex': if not inkscape_cmd and convert_cmd is None: - kernellog.warn(app, - "no SVG to PDF conversion available / include SVG raw." - "\nIncluding large raw SVGs can cause xelatex error." - "\nInstall Inkscape (preferred) or ImageMagick.") + logger.warning( + "no SVG to PDF conversion available / include SVG raw.\n" + "Including large raw SVGs can cause xelatex error.\n" + "Install Inkscape (preferred) or ImageMagick." + ) img_node.replace_self(file2literal(src_fname)) else: dst_fname = path.join(translator.builder.outdir, fname + '.pdf') @@ -306,15 +310,14 @@ def convert_image(img_node, translator, src_fname=None): _name = dst_fname[len(str(translator.builder.outdir)) + 1:] if isNewer(dst_fname, src_fname): - kernellog.verbose(app, - "convert: {out}/%s already exists and is newer" % _name) + logger.verbose("convert: {out}/%s already exists and is newer" % _name) else: ok = False mkdir(path.dirname(dst_fname)) if in_ext == '.dot': - kernellog.verbose(app, 'convert DOT to: {out}/' + _name) + logger.verbose('convert DOT to: {out}/' + _name) if translator.builder.format == 'latex' and not dot_Tpdf: svg_fname = path.join(translator.builder.outdir, fname + '.svg') ok1 = dot2format(app, src_fname, svg_fname) @@ -325,7 +328,7 @@ def convert_image(img_node, translator, src_fname=None): ok = dot2format(app, src_fname, dst_fname) elif in_ext == '.svg': - kernellog.verbose(app, 'convert SVG to: {out}/' + _name) + logger.verbose('convert SVG to: {out}/' + _name) ok = svg2pdf(app, src_fname, dst_fname) if not ok: @@ -354,7 +357,7 @@ def dot2format(app, dot_fname, out_fname): with open(out_fname, "w") as out: exit_code = subprocess.call(cmd, stdout = out) if exit_code != 0: - kernellog.warn(app, + logger.warning( "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) return bool(exit_code == 0) @@ -388,13 +391,14 @@ def svg2pdf(app, svg_fname, pdf_fname): pass if exit_code != 0: - kernellog.warn(app, "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) + logger.warning("Error #%d when calling: %s" % + (exit_code, " ".join(cmd))) if warning_msg: - kernellog.warn(app, "Warning msg from %s: %s" - % (cmd_name, str(warning_msg, 'utf-8'))) + logger.warning( "Warning msg from %s: %s" % + (cmd_name, str(warning_msg, 'utf-8'))) elif warning_msg: - kernellog.verbose(app, "Warning msg from %s (likely harmless):\n%s" - % (cmd_name, str(warning_msg, 'utf-8'))) + logger.verbose("Warning msg from %s 
(likely harmless):\n%s" % + (cmd_name, str(warning_msg, 'utf-8'))) return bool(exit_code == 0) @@ -418,7 +422,8 @@ def svg2pdf_by_rsvg(app, svg_fname, pdf_fname): # use stdout and stderr from parent exit_code = subprocess.call(cmd) if exit_code != 0: - kernellog.warn(app, "Error #%d when calling: %s" % (exit_code, " ".join(cmd))) + logger.warning("Error #%d when calling: %s" % + (exit_code, " ".join(cmd))) ok = bool(exit_code == 0) return ok @@ -440,7 +445,7 @@ class kernel_image(nodes.image): pass class KernelImage(images.Image): - u"""KernelImage directive + """KernelImage directive Earns everything from ``.. image::`` directive, except *remote URI* and *glob* pattern. The KernelImage wraps a image node into a @@ -476,7 +481,7 @@ class kernel_figure(nodes.figure): """Node for ``kernel-figure`` directive.""" class KernelFigure(Figure): - u"""KernelImage directive + """KernelImage directive Earns everything from ``.. figure::`` directive, except *remote URI* and *glob* pattern. The KernelFigure wraps a figure node into a kernel_figure @@ -513,15 +518,15 @@ def visit_kernel_render(self, node): app = self.builder.app srclang = node.get('srclang') - kernellog.verbose(app, 'visit kernel-render node lang: "%s"' % (srclang)) + logger.verbose('visit kernel-render node lang: "%s"' % srclang) tmp_ext = RENDER_MARKUP_EXT.get(srclang, None) if tmp_ext is None: - kernellog.warn(app, 'kernel-render: "%s" unknown / include raw.' % (srclang)) + logger.warning( 'kernel-render: "%s" unknown / include raw.' % srclang) return if not dot_cmd and tmp_ext == '.dot': - kernellog.verbose(app, "dot from graphviz not available / include raw.") + logger.verbose("dot from graphviz not available / include raw.") return literal_block = node[0] @@ -552,7 +557,7 @@ class kernel_render(nodes.General, nodes.Inline, nodes.Element): pass class KernelRender(Figure): - u"""KernelRender directive + """KernelRender directive Render content by external tool. Has all the options known from the *figure* directive, plus option ``caption``. If ``caption`` has a diff --git a/Documentation/sphinx/load_config.py b/Documentation/sphinx/load_config.py index 8b416bfd75ac..ec50e1ee5223 100644 --- a/Documentation/sphinx/load_config.py +++ b/Documentation/sphinx/load_config.py @@ -9,7 +9,7 @@ from sphinx.util.osutil import fs_encoding def loadConfig(namespace): # ------------------------------------------------------------------------------ - u"""Load an additional configuration file into *namespace*. + """Load an additional configuration file into *namespace*. The name of the configuration file is taken from the environment ``SPHINX_CONF``. 
The external configuration file extends (or overwrites) the diff --git a/Documentation/sphinx/maintainers_include.py b/Documentation/sphinx/maintainers_include.py index dcad0fff4723..d31cff867436 100755 --- a/Documentation/sphinx/maintainers_include.py +++ b/Documentation/sphinx/maintainers_include.py @@ -3,7 +3,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=R0903, C0330, R0914, R0912, E0401 -u""" +""" maintainers-include ~~~~~~~~~~~~~~~~~~~ @@ -37,7 +37,7 @@ def setup(app): ) class MaintainersInclude(Include): - u"""MaintainersInclude (``maintainers-include``) directive""" + """MaintainersInclude (``maintainers-include``) directive""" required_arguments = 0 def parse_maintainers(self, path): diff --git a/Documentation/sphinx/rstFlatTable.py b/Documentation/sphinx/rstFlatTable.py index 16bea0632555..180fbb50c337 100755 --- a/Documentation/sphinx/rstFlatTable.py +++ b/Documentation/sphinx/rstFlatTable.py @@ -2,7 +2,7 @@ # -*- coding: utf-8; mode: python -*- # pylint: disable=C0330, R0903, R0912 -u""" +""" flat-table ~~~~~~~~~~ @@ -99,7 +99,7 @@ class colSpan(nodes.General, nodes.Element): pass # pylint: disable=C0103,C0321 class FlatTable(Table): # ============================================================================== - u"""FlatTable (``flat-table``) directive""" + """FlatTable (``flat-table``) directive""" option_spec = { 'name': directives.unchanged @@ -135,7 +135,7 @@ class FlatTable(Table): class ListTableBuilder(object): # ============================================================================== - u"""Builds a table from a double-stage list""" + """Builds a table from a double-stage list""" def __init__(self, directive): self.directive = directive @@ -212,7 +212,7 @@ class ListTableBuilder(object): raise SystemMessagePropagation(error) def parseFlatTableNode(self, node): - u"""parses the node from a :py:class:`FlatTable` directive's body""" + """parses the node from a :py:class:`FlatTable` directive's body""" if len(node) != 1 or not isinstance(node[0], nodes.bullet_list): self.raiseError( @@ -225,7 +225,7 @@ class ListTableBuilder(object): self.roundOffTableDefinition() def roundOffTableDefinition(self): - u"""Round off the table definition. + """Round off the table definition. This method rounds off the table definition in :py:member:`rows`. diff --git a/Documentation/trace/postprocess/decode_msr.py b/Documentation/trace/postprocess/decode_msr.py index aa9cc7abd5c2..f5609b16f589 100644 --- a/Documentation/trace/postprocess/decode_msr.py +++ b/Documentation/trace/postprocess/decode_msr.py @@ -32,6 +32,6 @@ for j in sys.stdin: break if r: j = j.replace(" " + m.group(2), " " + r + "(" + m.group(2) + ")") - print j, + print(j) diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 692be4af9c9b..5bf1b4adebc1 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -58,9 +58,10 @@ Fornite documentazione 4) Tutti i nuovi parametri dei moduli sono documentati con ``MODULE_PARM_DESC()``. 5) Tutte le nuove interfacce verso lo spazio utente sono documentate in - ``Documentation/ABI/``. Leggete ``Documentation/ABI/README`` per maggiori - informazioni. Le patch che modificano le interfacce utente dovrebbero - essere inviate in copia anche a linux-api@vger.kernel.org. + ``Documentation/ABI/``. Leggete Documentation/admin-guide/abi.rst + (o ``Documentation/ABI/README``) per maggiori informazioni. 
+ Le patch che modificano le interfacce utente dovrebbero essere inviate + in copia anche a linux-api@vger.kernel.org. 6) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate ``Documentation/userspace-api/ioctl/ioctl-number.rst``. diff --git a/Documentation/translations/ja_JP/SubmitChecklist b/Documentation/translations/ja_JP/SubmitChecklist deleted file mode 100644 index 1759c6b452d6..000000000000 --- a/Documentation/translations/ja_JP/SubmitChecklist +++ /dev/null @@ -1,105 +0,0 @@ -NOTE: -This is a version of Documentation/process/submit-checklist.rst into Japanese. -This document is maintained by Takenori Nagano <t-nagano@ah.jp.nec.com> -and the JF Project team <http://www.linux.or.jp/JF/>. -If you find any difference between this document and the original file -or a problem with the translation, -please contact the maintainer of this file or JF project. - -Please also note that the purpose of this file is to be easier to read -for non English (read: Japanese) speakers and is not intended as a -fork. So if you have any comments or updates of this file, please try -to update the original English file first. - -Last Updated: 2008/07/14 -================================== -ã“れã¯ã€ -linux-2.6.26/Documentation/process/submit-checklist.rst ã®å’Œè¨³ã§ã™ã€‚ - -翻訳団体: JF プãƒã‚¸ã‚§ã‚¯ãƒˆ < http://www.linux.or.jp/JF/ > -翻訳日: 2008/07/14 -翻訳者: Takenori Nagano <t-nagano at ah dot jp dot nec dot com> -æ ¡æ£è€…: Masanori Kobayashi ã•ã‚“ <zap03216 at nifty dot ne dot jp> -================================== - - -Linux カーãƒãƒ«ãƒ‘ãƒƒãƒæŠ•ç¨¿è€…å‘ã‘ãƒã‚§ãƒƒã‚¯ãƒªã‚¹ãƒˆ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -本書ã§ã¯ã€ãƒ‘ッãƒã‚’ã‚ˆã‚Šç´ æ—©ãå–り込んã§ã‚‚らã„ãŸã„開発者ãŒå®Ÿè·µã™ã¹ã基本的ãªäº‹æŸ„ -ã‚’ã„ãã¤ã‹ç´¹ä»‹ã—ã¾ã™ã€‚ã“ã“ã«ã‚ã‚‹å…¨ã¦ã®äº‹æŸ„ã¯ã€Documentation/process/submitting-patches.rst -ãªã©ã®Linuxカーãƒãƒ«ãƒ‘ãƒƒãƒæŠ•ç¨¿ã«éš›ã—ã¦ã®å¿ƒå¾—を補足ã™ã‚‹ã‚‚ã®ã§ã™ã€‚ - - 1: 妥当ãªCONFIGオプションや変更ã•れãŸCONFIGオプションã€ã¤ã¾ã‚Š =y, =m, =n - å…¨ã¦ã§æ£ã—ãビルドã§ãã‚‹ã“ã¨ã‚’確èªã—ã¦ãã ã•ã„。ãã®éš›ã€gccåŠã³ãƒªãƒ³ã‚«ãŒ - warningã‚„errorを出ã—ã¦ã„ãªã„ã“ã¨ã‚‚確èªã—ã¦ãã ã•ã„。 - - 2: allnoconfig, allmodconfig オプションを用ã„ã¦æ£ã—ãビルドã§ãã‚‹ã“ã¨ã‚’ - 確èªã—ã¦ãã ã•ã„。 - - 3: 手許ã®ã‚¯ãƒã‚¹ã‚³ãƒ³ãƒ‘イルツールやOSDLã®PLMã®ã‚ˆã†ãªã‚‚ã®ã‚’用ã„ã¦ã€è¤‡æ•°ã® - アーã‚テクãƒãƒ£ã«ãŠã„ã¦ã‚‚æ£ã—ãビルドã§ãã‚‹ã“ã¨ã‚’確èªã—ã¦ãã ã•ã„。 - - 4: 64bité•·ã®'unsigned long'を使用ã—ã¦ã„ã‚‹ppc64ã¯ã€ã‚¯ãƒã‚¹ã‚³ãƒ³ãƒ‘イルã§ã® - ãƒã‚§ãƒƒã‚¯ã«é©å½“ãªã‚¢ãƒ¼ã‚テクãƒãƒ£ã§ã™ã€‚ - - 5: カーãƒãƒ«ã‚³ãƒ¼ãƒ‡ã‚£ãƒ³ã‚°ã‚¹ã‚¿ã‚¤ãƒ«ã«æº–æ‹ ã—ã¦ã„ã‚‹ã‹ã©ã†ã‹ç¢ºèªã—ã¦ãã ã•ã„(!) 
- - 6: CONFIGオプションã®è¿½åŠ ãƒ»å¤‰æ›´ã‚’ã—ãŸå ´åˆã«ã¯ã€CONFIGメニューãŒå£Šã‚Œã¦ã„ãªã„ - ã“ã¨ã‚’確èªã—ã¦ãã ã•ã„。 - - 7: æ–°ã—ãKconfigã®ã‚ªãƒ—ã‚·ãƒ§ãƒ³ã‚’è¿½åŠ ã™ã‚‹éš›ã«ã¯ã€å¿…ãšãã®helpも記述ã—ã¦ãã ã•ã„。 - - 8: é©åˆ‡ãªKconfigã®ä¾å˜é–¢ä¿‚を考ãˆãªãŒã‚‰æ…Žé‡ã«ãƒã‚§ãƒƒã‚¯ã—ã¦ãã ã•ã„。 - ãŸã ã—ã€ã“ã®ä½œæ¥ã¯ãƒžã‚·ãƒ³ã‚’使ã£ãŸãƒ†ã‚¹ãƒˆã§ãã¡ã‚“ã¨è¡Œã†ã®ãŒã¨ã¦ã‚‚困難ã§ã™ã€‚ - ã†ã¾ãã‚„ã‚‹ã«ã¯ã€è‡ªåˆ†ã®é ã§è€ƒãˆã‚‹ã“ã¨ã§ã™ã€‚ - - 9: sparseを利用ã—ã¦ã¡ã‚ƒã‚“ã¨ã—ãŸã‚³ãƒ¼ãƒ‰ãƒã‚§ãƒƒã‚¯ã‚’ã—ã¦ãã ã•ã„。 - -10: 'make checkstack' を利用ã—ã€å•題ãŒç™ºè¦‹ã•れãŸã‚‰ä¿®æ£ã—ã¦ãã ã•ã„。 - 'make checkstack' ã¯æ˜Žç¤ºçš„ã«å•題を示ã—ã¾ã›ã‚“ãŒã€ã©ã‚Œã‹ - 1ã¤ã®é–¢æ•°ãŒ512ãƒã‚¤ãƒˆã‚ˆã‚Šå¤§ãã„スタックを使ã£ã¦ã„れã°ã€ä¿®æ£ã™ã¹ã候補㨠- ãªã‚Šã¾ã™ã€‚ - -11: ã‚°ãƒãƒ¼ãƒãƒ«ãªkernel API を説明ã™ã‚‹ kernel-doc をソースã®ä¸ã«å«ã‚ã¦ãã ã•ã„。 - ( staticãªé–¢æ•°ã«ãŠã„ã¦ã¯å¿…é ˆã§ã¯ã‚りã¾ã›ã‚“ãŒã€å«ã‚ã¦ã‚‚らã£ã¦ã‚‚çµæ§‹ã§ã™ ) - ãã—ã¦ã€'make htmldocs' ã‚‚ã—ã㯠'make mandocs' を利用ã—ã¦è¿½è¨˜ã—㟠- ドã‚ュメントã®ãƒã‚§ãƒƒã‚¯ã‚’行ã„ã€å•題ãŒè¦‹ã¤ã‹ã£ãŸå ´åˆã«ã¯ä¿®æ£ã‚’行ã£ã¦ãã ã•ã„。 - -12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, - CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, - CONFIG_DEBUG_ATOMIC_SLEEP ã“れら全ã¦ã‚’åŒæ™‚ã«æœ‰åйã«ã—ã¦å‹•作確èªã‚’ - 行ã£ã¦ãã ã•ã„。 - -13: CONFIG_SMP, CONFIG_PREEMPT を有効ã«ã—ãŸå ´åˆã¨ç„¡åйã«ã—ãŸå ´åˆã®ä¸¡æ–¹ã§ - ビルドã—ãŸä¸Šã€å‹•作確èªã‚’行ã£ã¦ãã ã•ã„。 - -14: lockdepã®æ©Ÿèƒ½ã‚’å…¨ã¦æœ‰åйã«ã—ãŸä¸Šã§ã€å…¨ã¦ã®ã‚³ãƒ¼ãƒ‰ãƒ‘スを評価ã—ã¦ãã ã•ã„。 - -15: /proc ã«æ–°ã—ã„ã‚¨ãƒ³ãƒˆãƒªã‚’è¿½åŠ ã—ãŸå ´åˆã«ã¯ã€Documentation/ é…下㫠- å¿…ãšãƒ‰ã‚ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’è¿½åŠ ã—ã¦ãã ã•ã„。 - -16: æ–°ã—ã„ãƒ–ãƒ¼ãƒˆãƒ‘ãƒ©ãƒ¡ãƒ¼ã‚¿ã‚’è¿½åŠ ã—ãŸå ´åˆã«ã¯ã€ - å¿…ãšDocumentation/admin-guide/kernel-parameters.rst ã«èª¬æ˜Žã‚’è¿½åŠ ã—ã¦ãã ã•ã„。 - -17: æ–°ã—ãmoduleã«ãƒ‘ãƒ©ãƒ¡ãƒ¼ã‚¿ã‚’è¿½åŠ ã—ãŸå ´åˆã«ã¯ã€MODULE_PARM_DESC()ã‚’ - 利用ã—ã¦å¿…ãšãã®èª¬æ˜Žã‚’記述ã—ã¦ãã ã•ã„。 - -18: æ–°ã—ã„userspaceインタフェースを作æˆã—ãŸå ´åˆã«ã¯ã€Documentation/ABI/ ã« - Documentation/ABI/README ã‚’å‚考ã«ã—ã¦å¿…ãšãƒ‰ã‚ãƒ¥ãƒ¡ãƒ³ãƒˆã‚’è¿½åŠ ã—ã¦ãã ã•ã„。 - -19: å°‘ãªãã¨ã‚‚slabã‚¢ãƒã‚±ãƒ¼ã‚·ãƒ§ãƒ³ã¨pageã‚¢ãƒã‚±ãƒ¼ã‚·ãƒ§ãƒ³ã«å¤±æ•—ã—ãŸå ´åˆã® - 挙動ã«ã¤ã„ã¦ã€fault-injectionを利用ã—ã¦ç¢ºèªã—ã¦ãã ã•ã„。 - Documentation/fault-injection/ ã‚’å‚ç…§ã—ã¦ãã ã•ã„。 - - è¿½åŠ ã—ãŸã‚³ãƒ¼ãƒ‰ãŒã‹ãªã‚Šã®é‡ã§ã‚ã£ãŸãªã‚‰ã°ã€ã‚µãƒ–システム特有㮠- fault-injectionã‚’è¿½åŠ ã—ãŸã»ã†ãŒè‰¯ã„ã‹ã‚‚ã—れã¾ã›ã‚“。 - -20: æ–°ãŸã«è¿½åŠ ã—ãŸã‚³ãƒ¼ãƒ‰ã¯ã€`gcc -W'ã§ã‚³ãƒ³ãƒ‘イルã—ã¦ãã ã•ã„。 - ã“ã®ã‚ªãƒ—ションã¯å¤§é‡ã®ä¸è¦ãªãƒ¡ãƒƒã‚»ãƒ¼ã‚¸ã‚’出力ã—ã¾ã™ãŒã€ - "warning: comparison between signed and unsigned" ã®ã‚ˆã†ãªãƒ¡ãƒƒã‚»ãƒ¼ã‚¸ã¯ã€ - ãƒã‚°ã‚’見ã¤ã‘ã‚‹ã®ã«å½¹ã«ç«‹ã¡ã¾ã™ã€‚ - -21: 投稿ã—ãŸãƒ‘ッãƒãŒ -mm パッãƒã‚»ãƒƒãƒˆã«ãƒžãƒ¼ã‚¸ã•れãŸå¾Œã€å…¨ã¦ã®æ—¢å˜ã®ãƒ‘ッãƒã‚„ - VM, VFS ãŠã‚ˆã³ãã®ä»–ã®ã‚µãƒ–システムã«é–¢ã™ã‚‹æ§˜ã€…ãªå¤‰æ›´ã¨ã€ç¾æ™‚点ã§ã‚‚å…±å˜ - ã§ãã‚‹ã“ã¨ã‚’確èªã™ã‚‹ãƒ†ã‚¹ãƒˆã‚’行ã£ã¦ãã ã•ã„。 diff --git a/Documentation/translations/ja_JP/disclaimer-ja_JP.rst b/Documentation/translations/ja_JP/disclaimer-ja_JP.rst new file mode 100644 index 000000000000..46a026000407 --- /dev/null +++ b/Documentation/translations/ja_JP/disclaimer-ja_JP.rst @@ -0,0 +1,24 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _translations_ja_JP_disclaimer: + +========================== +å…責æ¡é … (Disclaimer) 抄訳 +========================== + +.. 
note:: ã€è¨³è¨»ã€‘ + ã“ã®æ–‡æ›¸ã¯ã€ + :ref:`Disclaimer (英語版) <translations_disclaimer>` + ã®ä¸€éƒ¨ã‚’翻訳ã—ãŸã‚‚ã®ã§ã™ã€‚全文ã¯è‹±èªžç‰ˆã‚’å‚照願ã„ã¾ã™ã€‚ + +Documentation/translations/ja_JP/ 以下ã®ãƒ•ァイルã¯ã€å¯¾å¿œã™ã‚‹ +Documentation/ 以下ã®ãƒ•ァイル (原文) ã®æ—¥æœ¬èªžè¨³ã§ã™ã€‚ +翻訳ã¨åŽŸæ–‡ã¨ã®é•ã„や翻訳上ã®å•題を見ã¤ã‘ãŸã‚‰ã€ +MAINTAINERS ã«è¨˜è¼‰ã®ç¶æŒç®¡ç†è€…ã«çŸ¥ã‚‰ã›ã¦ãã ã•ã„。 +翻訳ãŒåŽŸæ–‡ã®æ›´æ–°ã«è¿½ã„ã¤ã„ã¦ã„ãªã„å ´åˆã¯ã€ãれを日本語版ã«åæ˜ ã™ã‚‹ãƒ‘ッãƒã® +投稿もæ“迎ã§ã™ã€‚ + +ãªãŠã€ã“ã®ç¿»è¨³ã®ç›®çš„ã¯éžè‹±èªž (ã“ã“ã§ã¯æ—¥æœ¬èªž) 話者ã¸ã®ä¾¿å®œæä¾›ã§ã‚り〠+フォークをæ„図ã—ãŸã‚‚ã®ã§ã¯ãªã„事を念é ã«ãŠã„ã¦ãã ã•ã„。ã—ãŸãŒã£ã¦ã€ +ã“ã®ãƒ•ァイルã®å†…容ã«å¯¾ã™ã‚‹ã‚³ãƒ¡ãƒ³ãƒˆã‚„æ›´æ–°ã™ã¹ãã“ã¨ãŒã‚れã°ã€å…ˆã«åŽŸæ–‡ã® +更新を検討ã—ã¦ãã ã•ã„。 diff --git a/Documentation/translations/ja_JP/index.rst b/Documentation/translations/ja_JP/index.rst index 0b476b429e3b..4159b417bfdd 100644 --- a/Documentation/translations/ja_JP/index.rst +++ b/Documentation/translations/ja_JP/index.rst @@ -11,7 +11,9 @@ .. toctree:: :maxdepth: 1 + disclaimer-ja_JP process/howto + process/submit-checklist .. raw:: latex diff --git a/Documentation/translations/ja_JP/process/howto.rst b/Documentation/translations/ja_JP/process/howto.rst index d9ba40588e46..5e307f90982c 100644 --- a/Documentation/translations/ja_JP/process/howto.rst +++ b/Documentation/translations/ja_JP/process/howto.rst @@ -1,35 +1,18 @@ -.. raw:: latex +.. SPDX-License-Identifier: GPL-2.0 - \kerneldocCJKoff - -NOTE: -This is a version of Documentation/process/howto.rst translated into Japanese. -This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com> -If you find any difference between this document and the original file or -a problem with the translation, please contact the maintainer of this file. - -Please also note that the purpose of this file is to be easier to -read for non English (read: Japanese) speakers and is not intended as -a fork. So if you have any comments or updates for this file, please -try to update the original English file first. - ----------------------------------- - -.. raw:: latex - - \kerneldocCJKon - -ã“ã®æ–‡æ›¸ã¯ã€ -Documentation/process/howto.rst -ã®å’Œè¨³ã§ã™ã€‚ - -翻訳者: Tsugikazu Shibata <tshibata@ab.jp.nec.com> - ----------------------------------- +.. Originally contributed by Tsugikazu Shibata Linux カーãƒãƒ«é–‹ç™ºã®ã‚„り方 ========================== +.. note:: ã€è¨³è¨»ã€‘ + ã“ã®æ–‡æ›¸ã¯ã€ + Documentation/process/howto.rst + ã®ç¿»è¨³ã§ã™ã€‚ + å…責æ¡é …ã«ã¤ã„ã¦ã¯ã€ + :ref:`å…責æ¡é …ã®æŠ„è¨³ <translations_ja_JP_disclaimer>` ãŠã‚ˆã³ã€ + :ref:`Disclaimer (英語版) <translations_disclaimer>` ã‚’å‚ç…§ã—ã¦ãã ã•ã„。 + ã“れã¯ä¸Šã®ãƒˆãƒ”ック( Linux カーãƒãƒ«é–‹ç™ºã®ã‚„り方)ã®é‡è¦ãªäº‹æŸ„を網羅ã—㟠ドã‚ュメントã§ã™ã€‚ã“ã“ã«ã¯ Linux カーãƒãƒ«é–‹ç™ºè€…ã«ãªã‚‹ãŸã‚ã®æ–¹æ³•ã¨Linux カーãƒãƒ«é–‹ç™ºã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã¨å…±ã«æ´»å‹•ã™ã‚‹ã‚„り方をå¦ã¶æ–¹æ³•ãŒå«ã¾ã‚Œã¦ã„ã¾ã™ã€‚ diff --git a/Documentation/translations/ja_JP/process/submit-checklist.rst b/Documentation/translations/ja_JP/process/submit-checklist.rst new file mode 100644 index 000000000000..fb3b9e3bd8ee --- /dev/null +++ b/Documentation/translations/ja_JP/process/submit-checklist.rst @@ -0,0 +1,163 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. Translated by Akira Yokosawa <akiyks@gmail.com> + +.. An old translation of this document of a different origin was at + Documentation/translations/ja_JP/SubmitChecklist, which can be found + in the pre-v6.14 tree if you are interested. + Please note that this translation is independent of the previous one. 
+ +====================================== +Linux カーãƒãƒ«ãƒ‘ãƒƒãƒæŠ•ç¨¿ãƒã‚§ãƒƒã‚¯ãƒªã‚¹ãƒˆ +====================================== + +.. note:: ã€è¨³è¨»ã€‘ + ã“ã®æ–‡æ›¸ã¯ã€ + Documentation/process/submit-checklist.rst + ã®ç¿»è¨³ã§ã™ã€‚ + å…責æ¡é …ã«ã¤ã„ã¦ã¯ã€ + :ref:`å…責æ¡é …ã®æŠ„è¨³ <translations_ja_JP_disclaimer>` ãŠã‚ˆã³ã€ + :ref:`Disclaimer (英語版) <translations_disclaimer>` ã‚’å‚ç…§ã—ã¦ãã ã•ã„。 + +以下ã¯ã€ã‚«ãƒ¼ãƒãƒ«ãƒ‘ッãƒã®æŠ•稿時ã«ã€ãã®ã‚¹ãƒ ーズãªå—ã‘入れã®ãŸã‚ã«å¿ƒãŒã‘ã‚‹ +ã¹ã基本的ãªäº‹é …ã§ã™ã€‚ + +ã“れã¯ã€ Documentation/process/submitting-patches.rst ãŠã‚ˆã³ãã®ä»–ã® +Linux カーãƒãƒ«ãƒ‘ãƒƒãƒæŠ•ç¨¿ã«é–¢ã™ã‚‹æ–‡æ›¸ã‚’è¸ã¾ãˆã€ãれを補足ã™ã‚‹ã‚‚ã®ã§ã™ã€‚ + +.. note:: ã€è¨³è¨»ã€‘ + å¯èƒ½ãªé …ç›®ã«ã¤ã„ã¦ã¯ã€ãƒ‘ッãƒã‚‚ã—ãã¯ãƒ‘ッãƒå†…ã®æ›´æ–°ã‚’æš—é»™ã®ä¸»èªžã¨ã—ã¦ã€ + ãã®æœ›ã¾ã—ã„çŠ¶æ…‹ã‚’è¡¨ã™æ–‡ä½“ã¨ã—ã¾ã™ã€‚ãã®ä»–ã€åŽŸç¾©ã‚’æãªã‚ãªã„範囲㧠+ 係りçµã³ã‚’調整ã™ã‚‹ãªã©ã€ç°¡æ½”ã§æŠŠæ¡ã—ã‚„ã™ã„ç®‡æ¡æ›¸ãを目指ã—ã¾ã™ã€‚ + + +コードã®ãƒ¬ãƒ“ュー +================ + +1) 利用ã™ã‚‹æ©Ÿèƒ½ã«ã¤ã„ã¦ã€ãã®æ©Ÿèƒ½ã‚’定義・宣言ã—ã¦ã„るファイルを + ``#include`` ã—ã¦ã„る。 + ä»–ã®ãƒ˜ãƒƒãƒ€ãƒ¼ãƒ•ァイル経由ã§ã®å–り込ã¿ã«ä¾å˜ã—ãªã„。 + +2) Documentation/process/coding-style.rst ã«è©³è¿°ã•れã¦ã„る一般的ãªã‚¹ã‚¿ã‚¤ãƒ« + ã«ã¤ã„ã¦ãƒã‚§ãƒƒã‚¯æ¸ˆã¿ã€‚ + +3) メモリãƒãƒªã‚¢ãƒ¼ (例, ``barrier()``, ``rmb()``, ``wmb()``) ã«ã¤ã„ã¦ã€ + ãã®ã™ã¹ã¦ã«ã€ä½œç”¨ã¨ç›®çš„ã€åŠã³å¿…è¦ç†ç”±ã«ã¤ã„ã¦ã®èª¬æ˜ŽãŒã‚½ãƒ¼ã‚¹ã‚³ãƒ¼ãƒ‰å†…ã® + コメントã¨ã—ã¦è¨˜è¿°ã•れã¦ã„る。 + + +Kconfig 変更ã®ãƒ¬ãƒ“ュー +====================== + +1) æ–°è¦ã®ã€ã‚‚ã—ãã¯å¤‰æ›´ã•れ㟠``CONFIG`` オプションã«ã¤ã„ã¦ã€ãれãŒé–¢ä¿‚ã™ã‚‹ + コンフィグメニューã¸ã®æ‚ªå½±éŸ¿ãŒãªã„。ã¾ãŸã€ + Documentation/kbuild/kconfig-language.rst ã® + "Menu attibutes: default value" ã«è¨˜è¼‰ã®ä¾‹å¤–æ¡ä»¶ã‚’満ãŸã™å ´åˆã‚’除ã〠+ ãã®ãƒ‡ãƒ•ォルトãŒç„¡åйã«ãªã£ã¦ã„る。 + +2) æ–°è¦ã® ``Kconfig`` オプションã«ãƒ˜ãƒ«ãƒ—テã‚ストãŒã‚る。 + +3) 妥当㪠``Kconfig`` ã®çµ„ã¿åˆã‚ã›ã«ã¤ã„ã¦æ³¨æ„æ·±ãレビューã•れã¦ã„る。 + ã“れをテストã§ã‚„り切るã®ã¯å›°é›£ã§ã€çŸ¥åŠ›ãŒæ±ºã‚手ã¨ãªã‚‹ã€‚ + +ドã‚ュメンテーションã®ä½œæˆ +========================== + +1) ã‚°ãƒãƒ¼ãƒãƒ«ãªã‚«ãƒ¼ãƒãƒ« API ㌠:ref:`kernel-doc <kernel_doc>` ã®å½¢å¼ã§ + ドã‚ュメント化ã•れã¦ã„ã‚‹ (é™çš„関数ã«ã¯æ±‚ã‚られãªã„ãŒã€ä»˜ã‘ã¦ã‚‚よã„)。 + +2) æ–°è¦ ``/proc`` エントリーãŒã€ã™ã¹ã¦ ``Documentation/`` 以下ã«è¨˜è¼‰ã•れ㦠+ ã„る。 + +3) æ–°è¦ã‚«ãƒ¼ãƒãƒ«ãƒ»ãƒ–ート・パラメータãŒã€ã™ã¹ã¦ + ``Documentation/admin-guide/kernel-parameters.rst`` ã«è¨˜è¼‰ã•れã¦ã„る。 + +4) æ–°è¦ãƒ¢ã‚¸ãƒ¥ãƒ¼ãƒ«ãƒ»ãƒ‘ラメータãŒã€ã™ã¹ã¦ ``MODULE_PARM_DESC()`` ã«ã‚ˆã£ã¦è¨˜è¿° + ã•れã¦ã„る。 + +5) æ–°è¦ãƒ¦ãƒ¼ã‚¶ãƒ¼ã‚¹ãƒšãƒ¼ã‚¹ãƒ»ã‚¤ãƒ³ã‚¿ãƒ¼ãƒ•ェースãŒã€ã™ã¹ã¦ ``Documentaion/ABI/`` + 以下ã«è¨˜è¼‰ã•れã¦ã„る。詳ã—ãã¯ã€ Documentation/admin-guide/abi.rst + (ã‚‚ã—ã㯠``Documentation/ABI/README``) ã‚’å‚照。 + ユーザースペース・インターフェースを変更ã™ã‚‹ãƒ‘ッãƒã¯ã€ + linux-api@vger.kernel.org ã«ã‚‚ CC ã™ã¹ã—。 + +6) ãªã‚“らã‹ã® ioctl ã‚’è¿½åŠ ã™ã‚‹ãƒ‘ッãƒã¯ã€ + ``Documentation/userspace-api/ioctl/ioctl-number.rst`` + ã®æ›´æ–°ã‚’ä¼´ã†ã€‚ + +ツールã«ã‚ˆã‚‹ã‚³ãƒ¼ãƒ‰ã®ãƒã‚§ãƒƒã‚¯ +============================ + +1) スタイル・ãƒã‚§ãƒƒã‚«ãƒ¼ (``scripts/checkpatch.pl``) ã«ã‚ˆã£ã¦ã€çНã—ãŒã¡ãª + パッãƒãƒ»ã‚¹ã‚¿ã‚¤ãƒ«ã®é•åãŒãªã„ã“ã¨ã‚’ç¢ºèªæ¸ˆã¿ã€‚ + 指摘ã•れるé•åを残ã™å ´åˆã¯ã€ãれをæ£å½“化ã§ãã‚‹ã“ã¨ã€‚ + +2) sparse ã«ã‚ˆã‚Šå…¥å¿µã«ãƒã‚§ãƒƒã‚¯æ¸ˆã¿ã€‚ + +3) ``make checkstack`` ã§æŒ‡æ‘˜ã•れるå•題ãŒã‚れã°ã€ãれãŒä¿®æ£æ¸ˆã¿ã€‚ + ``checkstack`` ã¯å•題点を明示的ã«ã¯æŒ‡æ‘˜ã—ãªã„ãŒã€ スタック消費㌠+ 512 ãƒã‚¤ãƒˆã‚’è¶Šãˆã‚‹é–¢æ•°ã¯è¦‹ç›´ã—ã®å€™è£œã€‚ + +コードã®ãƒ“ルド +============== + +1) ä»¥ä¸‹ã®æ¡ä»¶ã§ã‚¯ãƒªãƒ¼ãƒ³ã«ãƒ“ルドã§ãる。 + + a) é©ç”¨å¯èƒ½ãªã€ãŠã‚ˆã³ ``=y``, ``=m``, ``=n`` を変更ã—㟠``CONFIG`` + オプションã§ã®ãƒ“ルド。 + ``gcc`` ãŠã‚ˆã³ãƒªãƒ³ã‚«ãƒ¼ã‹ã‚‰ã®è¦å‘Šãƒ»ã‚¨ãƒ©ãƒ¼ãŒãªã„ã“ã¨ã€‚ + + b) ``allnoconfig`` 㨠``allmodconfig`` ãŒãƒ‘ス + + c) ``O=builddir`` を指定ã—ã¦ã®ãƒ“ルド + + d) 
Documentation/ 以下ã®å¤‰æ›´ã«é–¢ã—ã¦ã€ãƒ‰ã‚ュメントã®ãƒ“ãƒ«ãƒ‰ã§æ–°ãŸãªè¦å‘Šã‚„ + エラーãŒå‡ºãªã„。 + ``make htmldocs`` ã¾ãŸã¯ ``make pdfdocs`` ã§ãƒ“ルドã—ã€å•題ãŒã‚れã°ä¿®æ£ã€‚ + +2) ãƒãƒ¼ã‚«ãƒ«ã®ã‚¯ãƒã‚¹ãƒ»ã‚³ãƒ³ãƒ‘イル・ツールã€ãã®ä»–ã®ãƒ“ルド環境 (訳註: build farm) + を使ã£ã¦ã€è¤‡æ•°ã® CPU アーã‚テクãƒãƒ£å‘ã‘ã«ãƒ“ルドã§ãる。 + 特ã«ã€ãƒ¯ãƒ¼ãƒ‰ã‚µã‚¤ã‚º (32 ビット㨠64 ビット) やエンディアン (ビッグã¨ãƒªãƒˆãƒ«) + ã®ç•°ãªã‚‹ã‚¢ãƒ¼ã‚テクãƒãƒ£ã‚’対象ã¨ã™ã‚‹ãƒ†ã‚¹ãƒˆã¯ã€è¡¨ç¾å¯èƒ½æ•°å€¤ç¯„囲・データ整列・ + エンディアンãªã©ã«ã¤ã„ã¦ã®èª¤ã£ãŸä»®å®šã«èµ·å› ã™ã‚‹æ§˜ã€…ãªç§»æ¤ä¸Šã®å•題をæ•ãˆã‚‹ + ã®ã«åŠ¹æžœçš„ã€‚ + +3) æ–°è¦ã«è¿½åŠ ã•れãŸã‚³ãƒ¼ãƒ‰ã«ã¤ã„㦠(``make KCFLAGS=-W`` を使ã£ã¦) + ``gcc -W`` ã§ã‚³ãƒ³ãƒ‘イル。 + ã“れã¯å¤šãã®ãƒŽã‚¤ã‚ºã‚’ä¼´ã†ãŒã€ + ``warning: comparison between signed and unsigned`` + ã®é¡žã„ã®ãƒã‚°ã‚’ã‚ã¶ã‚Šå‡ºã™ã®ã«åŠ¹æžœçš„ã€‚ + +4) 変更ã•れるソースコードãŒã€ä¸‹è¨˜ã® ``Kconfig`` シンボルã«é–¢é€£ã™ã‚‹ã‚«ãƒ¼ãƒãƒ« + API や機能ã«ä¾å˜ (ã‚‚ã—ãã¯åˆ©ç”¨) ã™ã‚‹å ´åˆã€ãれら㮠``Kconfig`` シンボルãŒã€ + 無効ã€ãŠã‚ˆã³ (å¯èƒ½ãªã‚‰) ``=m`` ã®å ´åˆã‚’組ã¿åˆã‚ã›ãŸè¤‡æ•°ã®ãƒ“ルドを + (全部ã¾ã¨ã‚ã¦ã§ã¯ãªãã€ã„ã‚ã„ã‚ãªãƒ©ãƒ³ãƒ€ãƒ ã®çµ„ã¿åˆã‚ã›ã§) テスト済ã¿ã€‚ + + ``CONFIG_SMP``, ``CONFIG_SYSFS``, ``CONFIG_PROC_FS``, ``CONFIG_INPUT``, + ``CONFIG_PCI``, ``CONFIG_BLOCK``, ``CONFIG_PM``, ``CONFIG_MAGIC_SYSRQ``, + ``CONFIG_NET``, ``CONFIG_INET=n`` (ãŸã ã—ã€å¾Œè€…㯠``CONFIG_NET=y`` + ã¨ã®çµ„ã¿åˆã‚ã›)。 + +コードã®ãƒ†ã‚¹ãƒˆ +============== + +1) ``CONFIG_PREEMPT``, ``CONFIG_DEBUG_PREEMPT``, + ``CONFIG_SLUB_DEBUG``, ``CONFIG_DEBUG_PAGEALLOC``, ``CONFIG_DEBUG_MUTEXES``, + ``CONFIG_DEBUG_SPINLOCK``, ``CONFIG_DEBUG_ATOMIC_SLEEP``, + ``CONFIG_PROVE_RCU`` ãŠã‚ˆã³ ``CONFIG_DEBUG_OBJECTS_RCU_HEAD`` ã‚’ã™ã¹ã¦ + åŒæ™‚ã«æœ‰åйã«ã—ã¦ã®ãƒ†ã‚¹ãƒˆæ¸ˆã¿ã€‚ + +2) ``CONFIG_SMP`` 㨠``CONFIG_PREEMPT`` ãŒæœ‰åйã¨ç„¡åйã®å ´åˆã«ã¤ã„ã¦ã€ãƒ“ルド㨠+ ランタイムã®ãƒ†ã‚¹ãƒˆæ¸ˆã¿ã€‚ + +3) lockdep ã®æ©Ÿèƒ½ã‚’ã™ã¹ã¦æœ‰åйã«ã—ã¦ã®å®Ÿè¡Œã§ã€ã™ã¹ã¦ã®ã‚³ãƒ¼ãƒ‰çµŒè·¯ãŒç¢ºèªæ¸ˆã¿ã€‚ + +4) 最低é™ã€slab 㨠ページ・アãƒã‚±ãƒ¼ã‚·ãƒ§ãƒ³ã®å¤±æ•—ã«é–¢ã™ã‚‹èª¤ã‚Šæ³¨å…¥ + (訳註: fault injection) ã«ã‚ˆã‚‹ãƒã‚§ãƒƒã‚¯æ¸ˆã¿ã€‚ + 詳ã—ãã¯ã€ Documentation/fault-injection/index.rst ã‚’å‚照。 + æ–°è¦ã®ã‚³ãƒ¼ãƒ‰ãŒå¤šã„å ´åˆã¯ã€ã‚µãƒ–システム対象ã®èª¤ã‚Šæ³¨å…¥ã‚’è¿½åŠ ã™ã‚‹ã®ãŒæœ›ã¾ã—ã„ + å¯èƒ½æ€§ã‚り。 + +5) linux-next ã®æœ€æ–°ã‚¿ã‚°ã«å¯¾ã™ã‚‹ãƒ†ã‚¹ãƒˆã«ã‚ˆã‚Šã€ä»–ã§ã‚ューイングã•れã¦ã„ã‚‹ + パッãƒã‚„ã€VMã€VFSã€ãã®ä»–ã®ã‚µãƒ–システム内ã®ã™ã¹ã¦ã®å¤‰æ›´ã¨çµ„ã¿åˆã‚ã›ã¦ã® + å‹•ä½œã‚’ç¢ºèªæ¸ˆã¿ã€‚ diff --git a/Documentation/translations/sp_SP/process/submit-checklist.rst b/Documentation/translations/sp_SP/process/submit-checklist.rst index 0d6651f9d871..e7107cc97001 100644 --- a/Documentation/translations/sp_SP/process/submit-checklist.rst +++ b/Documentation/translations/sp_SP/process/submit-checklist.rst @@ -97,9 +97,10 @@ y en otros lugares con respecto al envÃo de parches del kernel de Linux. ``MODULE_PARM_DESC()``. 18) Todas las nuevas interfaces de espacio de usuario están documentadas - en ``Documentation/ABI/``. Consulte ``Documentation/ABI/README`` para - obtener más información. Los parches que cambian las interfaces del - espacio de usuario deben ser CCed a linux-api@vger.kernel.org. + en ``Documentation/ABI/``. Consulte Documentation/admin-guide/abi.rst + (o ``Documentation/ABI/README``) para obtener más información. + Los parches que cambian las interfaces del espacio de usuario deben + ser CCed a linux-api@vger.kernel.org. 19) Se ha comprobado con la inyección de al menos errores de asignación de slab y página. Consulte ``Documentation/fault-injection/``. 
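Editorial note: the Documentation/sphinx changes earlier in this series (kernel_abi.py, kerneldoc.py and kfigure.py) all converge on the same two idioms: log through sphinx.util.logging instead of the now-deleted kernellog.py shim, and perform any expensive parsing only once behind a module-level cache, as kernel_abi.py now does in get_kernel_abi(). The sketch below is not part of the patch; it merely restates that pattern in a minimal, hypothetical extension (the names kernel_example, ExampleDirective and _cached_data are invented for illustration).
# Minimal sketch of the pattern used by the reworked extensions; not kernel code.
import os

from docutils import nodes
from docutils.parsers.rst import Directive
from sphinx.util import logging          # replaces the old kernellog wrapper

logger = logging.getLogger('kernel_example')

_cached_data = None

def get_cached_data():
    """Parse the expensive input only once per build (cf. get_kernel_abi())."""
    global _cached_data
    if _cached_data is None:
        srctree = os.path.abspath(os.environ.get("srctree", "."))
        logger.verbose("parsing input from %s" % srctree)
        _cached_data = {"srctree": srctree}   # stand-in for AbiParser() etc.
    return _cached_data

class ExampleDirective(Directive):
    """Toy directive: emits one paragraph built from the cached data."""
    required_arguments = 1

    def run(self):
        data = get_cached_data()
        if not self.state.document.settings.file_insertion_enabled:
            logger.warning("file insertion disabled")
            return []
        text = "%s (srctree: %s)" % (self.arguments[0], data["srctree"])
        return [nodes.paragraph(text=text)]

def setup(app):
    app.add_directive("kernel-example", ExampleDirective)
    return {
        "version": "1.0",
        "parallel_read_safe": True,
        "parallel_write_safe": True,
    }
In the real patch, _cached_data roughly corresponds to the AbiParser instance built in get_kernel_abi(), and the generated content is fed to Sphinx symbol by symbol through a ViewList and switch_source_input() rather than returned as a single paragraph, which is what keeps large ABI files from overwhelming the parser.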
diff --git a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst index dc728c739e28..b35d24464be9 100644 --- a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst +++ b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst @@ -112,7 +112,7 @@ CFS usa una granularidad de nanosegundos y no depende de ningún jiffy o detalles como HZ. De este modo, el gestor de tareas CFS no tiene noción de "ventanas de tiempo" de la forma en que tenÃa el gestor de tareas previo, y tampoco tiene heurÃsticos. Únicamente hay un parámetro -central ajustable (se ha de cambiar en CONFIG_SCHED_DEBUG): +central ajustable: /sys/kernel/debug/sched/base_slice_ns diff --git a/Documentation/translations/zh_CN/admin-guide/README.rst b/Documentation/translations/zh_CN/admin-guide/README.rst index e679cbc3c89d..1bdafdc4c8e2 100644 --- a/Documentation/translations/zh_CN/admin-guide/README.rst +++ b/Documentation/translations/zh_CN/admin-guide/README.rst @@ -146,7 +146,7 @@ Linuxå†…æ ¸6.x版本 <http://kernel.org/> "make xconfig" 基于Qtçš„é…置工具。 - "make gconfig" 基于GTK+çš„é…置工具。 + "make gconfig" 基于GTKçš„é…置工具。 "make oldconfig" 基于现有的 ./.config 文件选择所有选项,并询问 æ–°é…置选项。 diff --git a/Documentation/translations/zh_CN/dev-tools/ubsan.rst b/Documentation/translations/zh_CN/dev-tools/ubsan.rst index 2487696b3772..81ef6f77caeb 100644 --- a/Documentation/translations/zh_CN/dev-tools/ubsan.rst +++ b/Documentation/translations/zh_CN/dev-tools/ubsan.rst @@ -3,7 +3,14 @@ .. include:: ../disclaimer-zh_CN.rst :Original: Documentation/dev-tools/ubsan.rst -:Translator: Dongliang Mu <dzm91@hust.edu.cn> + +:翻译: + + 慕冬亮 Dongliang Mu <dzm91@hust.edu.cn> + +:æ ¡è¯‘: + + 王昱力 WangYuli <wangyuli@uniontech.com> 未定义行为消毒剂 - UBSAN ==================================== @@ -55,30 +62,20 @@ GCC自4.9.x [1_] ï¼ˆè¯¦è§ ``-fsanitize=undefined`` 选项åŠå…¶å选项)版æ ä½¿ç”¨å¦‚ä¸‹å†…æ ¸é…ç½®å¯ç”¨UBSAN:: - CONFIG_UBSAN=y - -ä½¿ç”¨å¦‚ä¸‹å†…æ ¸é…ç½®æ£€æŸ¥æ•´ä¸ªå†…æ ¸:: - - CONFIG_UBSAN_SANITIZE_ALL=y - -为了在特定文件或目录å¯åŠ¨ä»£ç æ’桩,需è¦åœ¨ç›¸åº”çš„å†…æ ¸Makefile䏿·»åŠ ä¸€è¡Œç±»ä¼¼å†…å®¹: + CONFIG_UBSAN=y -- 啿–‡ä»¶ï¼ˆå¦‚main.o):: - - UBSAN_SANITIZE_main.o := y - -- 一个目录ä¸çš„æ‰€æœ‰æ–‡ä»¶:: - - UBSAN_SANITIZE := y - -å³ä½¿è®¾ç½®äº†``CONFIG_UBSAN_SANITIZE_ALL=y``,为了é¿å…æ–‡ä»¶è¢«æ’æ¡©ï¼Œå¯ä½¿ç”¨:: +排除è¦è¢«æ£€æµ‹çš„æ–‡ä»¶:: UBSAN_SANITIZE_main.o := n -与:: +排除一个目录ä¸çš„æ‰€æœ‰æ–‡ä»¶:: UBSAN_SANITIZE := n +当全部文件都被ç¦ç”¨ï¼Œå¯é€šè¿‡å¦‚下方å¼ä¸ºç‰¹å®šæ–‡ä»¶å¯ç”¨:: + + UBSAN_SANITIZE_main.o := y + 未对é½çš„内å˜è®¿é—®æ£€æµ‹å¯é€šè¿‡å¼€å¯ç‹¬ç«‹é€‰é¡¹ - CONFIG_UBSAN_ALIGNMENT 检测。 è¯¥é€‰é¡¹åœ¨æ”¯æŒæœªå¯¹é½è®¿é—®çš„æž¶æž„上(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y) 默认为关é—。该选项ä»å¯é€šè¿‡å†…æ ¸é…ç½®å¯ç”¨ï¼Œä½†å®ƒå°†äº§ç”Ÿå¤§é‡çš„UBSAN报告。 diff --git a/Documentation/translations/zh_CN/disclaimer-zh_CN.rst b/Documentation/translations/zh_CN/disclaimer-zh_CN.rst index 3c6db094a63c..37681c0b2a01 100644 --- a/Documentation/translations/zh_CN/disclaimer-zh_CN.rst +++ b/Documentation/translations/zh_CN/disclaimer-zh_CN.rst @@ -1,9 +1,7 @@ :orphan: -.. warning:: +.. note:: æ¤æ–‡ä»¶çš„ç›®çš„æ˜¯ä¸ºè®©ä¸æ–‡è¯»è€…更容易阅读和ç†è§£ï¼Œè€Œä¸æ˜¯ä½œä¸ºä¸€ä¸ªåˆ†æ”¯ã€‚ å› æ¤ï¼Œ å¦‚æžœæ‚¨å¯¹æ¤æ–‡ä»¶æœ‰ä»»ä½•æ„è§æˆ–更新,请先å°è¯•更新原始英文文件。 - -.. 
note:: - 如果您å‘现本文档与原始文件有任何ä¸åŒæˆ–者有翻译问题,请è”系该文件的译者, - 或者请求时奎亮的帮助:<alexs@kernel.org>。 + 如果您å‘现本文档与原始文件有任何ä¸åŒæˆ–者有翻译问题,请å‘建议或者补ä¸ç»™ + è¯¥æ–‡ä»¶çš„è¯‘è€…ï¼Œæˆ–è€…è¯·æ±‚ä¸æ–‡æ–‡æ¡£ç»´æŠ¤è€…和审阅者的帮助。 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 7574e1673180..cc512ca54172 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -26,7 +26,13 @@ é¡ºä¾¿è¯´ä¸‹ï¼Œä¸æ–‡æ–‡æ¡£ä¹Ÿéœ€è¦éµå®ˆå†…æ ¸ç¼–ç é£Žæ ¼ï¼Œé£Žæ ¼ä¸ä¸æ–‡å’Œè‹±æ–‡çš„主è¦ä¸åŒå°±æ˜¯ä¸æ–‡ çš„å—ç¬¦æ ‡ç‚¹å 用两个英文å—ç¬¦å®½åº¦ï¼Œæ‰€ä»¥ï¼Œå½“è‹±æ–‡è¦æ±‚ä¸è¦è¶…过æ¯è¡Œ100个å—符时, 䏿–‡å°±ä¸è¦è¶…过50个å—符。å¦å¤–ï¼Œä¹Ÿè¦æ³¨æ„'-','='ç‰ç¬¦å·ä¸Žç›¸å…³æ ‡é¢˜çš„对é½ã€‚在将 -è¡¥ä¸æäº¤åˆ°ç¤¾åŒºä¹‹å‰ï¼Œä¸€å®šè¦è¿›è¡Œå¿…è¦çš„ ``checkpatch.pl`` 检查和编译测试。 +è¡¥ä¸æäº¤åˆ°ç¤¾åŒºä¹‹å‰ï¼Œä¸€å®šè¦è¿›è¡Œå¿…è¦çš„ ``checkpatch.pl`` æ£€æŸ¥å’Œç¼–è¯‘æµ‹è¯•ï¼Œç¡®ä¿ +在 ``make htmldocs/pdfdocs`` ä¸ä¸å¢žåŠ æ–°çš„å‘Šè¦ï¼Œæœ€åŽï¼Œå®‰è£…æ£€æŸ¥ä½ ç”Ÿæˆçš„ +html/pdf æ–‡ä»¶ï¼Œç¡®è®¤å®ƒä»¬çœ‹èµ·æ¥æ˜¯æ£å¸¸çš„。 + +æäº¤ä¹‹å‰è¯·ç¡®è®¤ä½ 的补ä¸å¯ä»¥æ£å¸¸æäº¤åˆ°ä¸æ–‡æ–‡æ¡£ç»´æŠ¤åº“: +https://git.kernel.org/pub/scm/linux/kernel/git/alexs/linux.git/ +å¦‚æžœä½ çš„è¡¥ä¸ä¾èµ–于其他人的补ä¸, å¯ä»¥ä¸Žå…¶ä»–人商é‡åŽç”±æŸä¸€ä¸ªäººåˆå¹¶æäº¤ã€‚ 与Linux å†…æ ¸ç¤¾åŒºä¸€èµ·å·¥ä½œ ------------------------ diff --git a/Documentation/translations/zh_CN/mm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst index 6fd79209c307..f877c0cfa39a 100644 --- a/Documentation/translations/zh_CN/mm/balance.rst +++ b/Documentation/translations/zh_CN/mm/balance.rst @@ -64,7 +64,7 @@ kswapdå¹¶ä¸çœŸæ£éœ€è¦å¹³è¡¡é«˜å†…å˜åŒºï¼Œå› ä¸ºä¸æ–上下文并ä¸è¯·æ±‚é« å¦‚æžœä»Žè¿›ç¨‹å†…å˜å’Œshmä¸å·å–页é¢å¯ä»¥å‡è½»è¯¥é¡µé¢èŠ‚ç‚¹ä¸ä»»ä½•区的内å˜åŽ‹åŠ›ï¼Œè€Œè¯¥åŒºçš„å†…å˜åŽ‹åŠ› å·²ç»ä½ŽäºŽå…¶æ°´ä½ï¼Œåˆ™ä¼šè¿›è¡Œå·å–。 -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: +watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: 这些是æ¯ä¸ªåŒºçš„å—æ®µï¼Œç”¨äºŽç¡®å®šä¸€ä¸ªåŒºä½•时需è¦å¹³è¡¡ã€‚当页颿•°ä½ŽäºŽæ°´ä½[WMARK_MIN]时, hysteric çš„å—æ®µlow_on_memoryè¢«è®¾ç½®ã€‚è¿™ä¸ªå—æ®µä¼šä¸€ç›´è¢«è®¾ç½®ï¼Œç›´åˆ°ç©ºé—²é¡µæ•°å˜æˆæ°´ä½ [WMARK_HIGH]。当low_on_memory被设置时,页é¢åˆ†é…请求将å°è¯•释放该区域的一些页é¢ï¼ˆå¦‚æžœ diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst index 10536b74aeec..0e524f1c1af5 100644 --- a/Documentation/translations/zh_CN/process/submit-checklist.rst +++ b/Documentation/translations/zh_CN/process/submit-checklist.rst @@ -82,8 +82,8 @@ Linuxå†…æ ¸è¡¥ä¸æäº¤æ£€æŸ¥å• 17) 所有新的模å—傿•°éƒ½è®°å½•在 ``MODULE_PARM_DESC()`` 18) 所有新的用户空间接å£éƒ½è®°å½•在 ``Documentation/ABI/`` ä¸ã€‚有关详细信æ¯ï¼Œ - 请å‚阅 ``Documentation/ABI/README`` 。更改用户空间接å£çš„è¡¥ä¸åº”è¯¥æŠ„é€ - linux-api@vger.kernel.org。 + 请å‚阅 Documentation/admin-guide/abi.rst (或 ``Documentation/ABI/README``)。 + 更改用户空间接å£çš„è¡¥ä¸åº”è¯¥æŠ„é€ linux-api@vger.kernel.org\ 。 19) 已通过至少注入slabå’Œpage分é…失败进行检查。请å‚阅 ``Documentation/fault-injection/`` 。 å¦‚æžœæ–°ä»£ç æ˜¯å®žè´¨æ€§çš„ï¼Œé‚£ä¹ˆæ·»åŠ å系统特定的故障注入å¯èƒ½æ˜¯åˆé€‚的。 diff --git a/Documentation/translations/zh_CN/security/credentials.rst b/Documentation/translations/zh_CN/security/credentials.rst new file mode 100644 index 000000000000..91c353dfb622 --- /dev/null +++ b/Documentation/translations/zh_CN/security/credentials.rst @@ -0,0 +1,479 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/security/credentials.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +============= +Linuxä¸çš„凿® +============= + +作者: David Howells <dhowells@redhat.com> + +.. contents:: :local: + +概述 +==== + +当一个对象对å¦ä¸€ä¸ªå¯¹è±¡è¿›è¡Œæ“作时,Linux执行的安全检查包å«å‡ 个部分: + + 1. 对象 + + 对象是å¯ä»¥ç›´æŽ¥ç”±ç”¨æˆ·ç©ºé—´ç¨‹åºæ“作的系统ä¸çš„实体。Linux具有多ç§å¯æ“作 + 的对象,包括: + + - 任务 + - 文件/索引节点 + - å¥—æŽ¥å— + - 消æ¯é˜Ÿåˆ— + - å…±äº«å†…å˜æ®µ + - ä¿¡å·é‡ + - 密钥 + + 所有这些对象的æè¿°çš„ä¸€éƒ¨åˆ†æ˜¯ä¸€ç»„å‡æ®ã€‚集åˆä¸çš„内容å–决于对象的类型。 + + 2. å¯¹è±¡æ‰€æœ‰æƒ + + å¤§å¤šæ•°å¯¹è±¡çš„å‡æ®ä¸ä¼šæœ‰ä¸€ä¸ªå集用æ¥è¡¨ç¤ºè¯¥å¯¹è±¡çš„æ‰€æœ‰æƒã€‚ + è¿™ç”¨äºŽèµ„æºæ ¸ç®—å’Œé™åˆ¶ï¼ˆå¦‚ç£ç›˜é…é¢å’Œä»»åŠ¡èµ„æºé™åˆ¶ï¼‰ã€‚ + + ä¾‹å¦‚ï¼Œåœ¨æ ‡å‡†çš„UNIX文件系统ä¸ï¼Œè¿™å°†ç”±æ ‡è®°åœ¨ç´¢å¼•节点上的UID定义。 + + 3. 对象上下文 + + æ¤å¤–åœ¨è¿™äº›å¯¹è±¡çš„å‡æ®ä¸ï¼Œå°†æœ‰ä¸€ä¸ªå集表示对象的“对象上下文â€ã€‚ + è¿™å¯èƒ½ä¸Žï¼ˆ2)ä¸ç›¸åŒï¼Œä¹Ÿå¯èƒ½ä¸åŒ —— ä¾‹å¦‚ï¼Œåœ¨æ ‡å‡†çš„UNIX文件ä¸ï¼Œ + è¿™æ˜¯ç”±æ ‡è®°åœ¨ç´¢å¼•èŠ‚ç‚¹ä¸Šçš„UIDå’ŒGID定义的。 + + 对象上下文是进行安全计算的一部分,当对象被æ“作时会用到。 + + 4. 主体 + + 主体是æ£åœ¨å¯¹å…¶ä»–对象执行æ“作的对象。 + + 系统ä¸çš„å¤§å¤šæ•°å¯¹è±¡æ˜¯ä¸æ´»åŠ¨çš„ï¼šä»–ä»¬ä¸ä¼šå¯¹ç³»ç»Ÿä¸çš„其他对象起作用。 + 进程/任务是明显的例外:它们å¯ä»¥è®¿é—®å’Œæ“纵其他对象。 + + 任务之外的其他对象在æŸäº›æƒ…况下也å¯ä»¥æ˜¯ä¸»ä½“。例如,打开的文件å¯ä»¥ä½¿ç”¨ + å为 ``fcntl(F_SETOWN)`` 的任务给它的UIDå’ŒEUIDå‘一个任务å‘é€SIGIO + ä¿¡å·ã€‚åœ¨è¿™ç§æƒ…况下,文件结构也会有一个主体上下文。 + + 5. 主体上下文 + + 䏻体坹其凿®æœ‰ä¸€ä¸ªé¢å¤–çš„è§£é‡Šã€‚å…¶å‡æ®çš„一个å集形æˆäº†â€œä¸»ä½“上下文â€ã€‚主体 + 上下文在主体执行æ“作时作为安全计算的一部分使用。 + + 例如,Linux任务在æ“作文件时会有FSUIDã€FSGIDå’Œé™„åŠ ç»„åˆ—è¡¨ —— è¿™äº›å‡æ® + 与通常构æˆä»»åŠ¡çš„å¯¹è±¡ä¸Šä¸‹æ–‡çš„çœŸå®žUIDå’ŒGID是相互独立的。 + + 6. æ“作 + + Linuxæä¾›è®¸å¤šæ“作,主体å¯ä»¥å¯¹å¯¹è±¡æ‰§è¡Œè¿™äº›æ“作。å¯ç”¨çš„æ“ä½œé›†å–决于主体 + 和对象的性质。 + + + æ“作包括读å–ã€å†™å…¥ã€åˆ›å»ºå’Œåˆ é™¤æ–‡ä»¶ï¼Œä»¥åŠæ´¾ç”Ÿï¼ˆforking)或å‘é€ + ä¿¡å·ï¼ˆsignalling)和跟踪(tracing)任务ç‰ã€‚ + + 7. 规则,访问控制列表和安全计算 + + 当主体对对象进行æ“作时,会进行安全计算。这涉åŠåˆ°ä½¿ç”¨ä¸»ä½“上下文ã€å¯¹è±¡ + 上下文和æ“作,并æœç´¢ä¸€ä¸ªæˆ–多个规则集,以确定在给定这些上下文的情况下, + 主体是å¦è¢«æŽˆäºˆæˆ–æ‹’ç»ä»¥æ‰€éœ€æ–¹å¼å¯¹å¯¹è±¡è¿›è¡Œæ“作的æƒé™ã€‚ + + ä¸»è¦æœ‰ä¸¤ä¸ªè§„åˆ™æ¥æºï¼š + + a. 自主访问控制(DAC): + + 有时,对象的æè¿°ä¸ä¼šåŒ…å«ä¸€ç»„è§„åˆ™ã€‚è¿™å°±æ˜¯æ‰€è°“çš„â€œè®¿é—®æŽ§åˆ¶åˆ—è¡¨â€æˆ–‘ACL’。 + 一个Linux文件å¯ä»¥æä¾›å¤šä¸ªACL。 + + ä¾‹å¦‚ï¼Œä¼ ç»Ÿçš„UNIX文件包括一个æƒé™æŽ©ç ,它是一个简化的ACL,具有三个固定的 + 主体类别(“用户â€ã€â€œç»„â€å’Œâ€œå…¶ä»–â€ï¼‰ï¼Œæ¯ä¸€ä¸ªéƒ½å¯ä»¥è¢«æŽˆäºˆä¸€å®šçš„特æƒï¼ˆå¦‚“读å–â€ã€ + “写入â€å’Œâ€œæ‰§è¡Œâ€ —— æ— è®ºè¿™äº›æ˜ å°„å¯¹äºŽå¯¹è±¡æ„味ç€ä»€ä¹ˆï¼‰ã€‚然而,UNIX文件æƒé™ä¸ + å…è®¸ä»»æ„æŒ‡å®šä¸»ä½“ï¼Œå› æ¤ç”¨é€”有é™ã€‚ + + Linux文件还å¯ä»¥æ”¯æŒPOSIX ACL。这是一个规则列表,为任æ„主体授予å„ç§æƒé™ã€‚ + + b. 强制访问控制(MAC): + + 整个系统å¯èƒ½æœ‰ä¸€ä¸ªæˆ–多个规则集,适用于所有主体和对象,ä¸è€ƒè™‘å®ƒä»¬çš„æ¥æºã€‚ + SELinuxå’ŒSmackå°±æ˜¯è¿™ç§æƒ…况的例å。 + + 在SELinuxå’ŒSmack的情况下,æ¯ä¸ªå¯¹è±¡åœ¨å…¶å‡æ®ä¸éƒ½è¢«èµ‹äºˆä¸€ä¸ªæ ‡ç¾ã€‚当请求执 + 行æ“ä½œæ—¶ï¼Œå®ƒä»¬ä½¿ç”¨ä¸»ä½“æ ‡ç¾ã€å¯¹è±¡æ ‡ç¾å’Œæ“ä½œï¼Œå¯»æ‰¾ä¸€ä¸ªè§„åˆ™ï¼Œè¯¥è§„åˆ™è¡¨ç¤ºæ¤æ“ + 作是授予还是拒ç»çš„。 + + +凿®ç±»åž‹ +======== + +Linuxå†…æ ¸æ”¯æŒä»¥ä¸‹ç±»åž‹çš„凿®ï¼š + + 1. ä¼ ç»Ÿçš„UNIX凿®ã€‚ + + - 真实用户ID + - 真实组ID + + UIDå’ŒGIDç”±å¤§å¤šæ•°ï¼ˆå¦‚æžœä¸æ˜¯å…¨éƒ¨ï¼‰Linux对象æºå¸¦ï¼Œå³ä½¿æœ‰æ—¶å®ƒä»¬éœ€è¦è¢«è™šæž„出 + æ¥ï¼ˆä¾‹å¦‚FAT或CIFSæ–‡ä»¶ï¼Œè¿™äº›æ–‡ä»¶æ¥æºäºŽWindows)。这些(通常)定义了该对象 + 的对象上下文,但任务在æŸäº›æƒ…况下略有ä¸åŒã€‚ + + - 有效用户ID,ä¿å˜ç”¨æˆ·IDå’ŒFS用户ID + - 有效组ID,ä¿å˜ç»„IDå’ŒFS组ID + - 补充组 + + 这些是仅由任务使用的é¢å¤–凿®ã€‚通常,一个EUID/EGID/GROUPS 被用作主体上下文, + 而真实UID/GID è¢«ç”¨ä½œå¯¹è±¡ä¸Šä¸‹æ–‡ã€‚å¯¹äºŽä»»åŠ¡ï¼Œè¿™å¹¶ä¸æ€»æ˜¯æ£ç¡®çš„。 + + 2. 
能力 + + - å…è®¸çš„èƒ½åŠ›é›†åˆ + - å¯ç»§æ‰¿çš„èƒ½åŠ›é›†åˆ + - æœ‰æ•ˆçš„èƒ½åŠ›é›†åˆ + - èƒ½åŠ›è¾¹ç•Œé›†åˆ + + 这些仅由任务æºå¸¦ï¼Œè¡¨ç¤ºæŽˆäºˆä»»åŠ¡çš„è¶…å‡ºæ™®é€šä»»åŠ¡æƒé™çš„能力。这些å¯ä»¥é€šè¿‡ä¼ 统 + UNIX凿®çš„æ›´æ”¹è¿›è¡Œéšå¼æ“作,但也å¯ä»¥é€šè¿‡ ``capset()`` 系统调用直接æ“作。 + + å…许的能力是指进程å¯ä»¥é€šè¿‡ ``capset()`` å°†å…¶æ·»åŠ åˆ°å…¶æœ‰æ•ˆæˆ–å…许集åˆä¸çš„ + 那些能力。这个å¯ç»§æ‰¿çš„集åˆä¹Ÿå¯èƒ½å—åˆ°è¿™æ ·çš„é™åˆ¶ã€‚ + + 有效能力是任务本身实际å¯ä»¥ä½¿ç”¨çš„能力。 + + å¯ç»§æ‰¿èƒ½åŠ›æ˜¯é‚£äº›å¯ä»¥é€šè¿‡ ``execve()`` ä¼ é€’çš„èƒ½åŠ›ã€‚ + + 边界集é™åˆ¶äº†é€šè¿‡ ``execve()`` 继承的能力,特别是在以UID 0执行二进制文件时。 + + 3. å®‰å…¨ç®¡ç†æ ‡è®°ï¼ˆsecurebits) + + å®ƒä»¬ç”¨äºŽæŽ§åˆ¶ä¸Šè¿°å‡æ®åœ¨ç‰¹å®šæ“作如execve()ä¸çš„æ“ä½œå’Œç»§æ‰¿æ–¹å¼ã€‚它们并ä¸ç›´æŽ¥ + ç”¨ä½œå¯¹è±¡æˆ–ä¸»ä½“å‡æ®ä½¿ç”¨ã€‚ + + 4. 密钥和密钥环 + + 这些仅由任务æºå¸¦ã€‚它们用于æºå¸¦å’Œç¼“å˜ä¸é€‚åˆæ”¾å…¥å…¶ä»–æ ‡å‡†UNIX凿®ä¸çš„安全令牌。 + 它们用诸如使网络文件系统密钥在进程执行的文件访问时å¯ç”¨ï¼Œè€Œæ— 需让普通程åºäº†è§£ + 涉åŠçš„安全细节。 + + 密钥环是一ç§ç‰¹æ®Šç±»åž‹çš„密钥。它们æºå¸¦ä¸€ç»„其他密钥,并å¯ä»¥æœç´¢æ¥æŸ¥æ‰¾æ‰€éœ€çš„密钥。 + æ¯ä¸ªè¿›ç¨‹å¯ä»¥è®¢é˜…多个密钥环: + + æ¯çº¿ç¨‹å¯†é’¥ + æ¯è¿›ç¨‹å¯†é’¥çޝ + æ¯ä¼šè¯å¯†é’¥çޝ + + 当进程访问一个密钥时,若尚ä¸å˜åœ¨ï¼Œåˆ™é€šå¸¸ä¼šå°†å…¶ç¼“å˜åœ¨ä¸€ä¸ªå¯†é’¥çޝä¸ï¼Œä»¥ä¾¿å°†æ¥çš„ + 访问时找到该密钥。 + + 有关密钥的更多信æ¯ï¼Œè¯·å‚è§ ``Documentation/translations/zh_CN/security/keys/*`` 。 + + 5. LSM + + Linux安全模å—å…许在任务执行æ“ä½œæ—¶æ–½åŠ é¢å¤–的控制。目å‰ï¼ŒLinux支æŒå‡ ç§LSM选项。 + + ä¸€äº›å·¥ä½œé€šè¿‡æ ‡è®°ç³»ç»Ÿä¸çš„对象,并应用一组规则(ç–略)说明æŸä¸ªæ ‡ç¾çš„任务å¯ä»¥å¯¹ + å¦ä¸€æ ‡ç¾çš„对象执行哪些æ“作。 + + 6. AF_KEY + + 这是一ç§åŸºäºŽå¥—接å—网络åè®®æ ˆä¸çš„凿®ç®¡ç†[RFC 2367]ã€‚æœ¬æ–‡æ¡£ä¸æ²¡æœ‰è®¨è®ºå®ƒ,å› ä¸ºä¸ + ç›´æŽ¥ä¸Žä»»åŠ¡å’Œæ–‡ä»¶å‡æ®è¿›è¡Œäº¤äº’,而是ä¿ç•™äº†ç³»ç»Ÿçº§çš„凿®ã€‚ + + +当打开一个文件时,打开任务的主体上下文的一部分会记录在创建的文件结构ä¸ã€‚ +这使得使用该文件结构的æ“作å¯ä»¥ä½¿ç”¨è¿™äº›å‡æ®ï¼Œè€Œä¸æ˜¯å‘出æ“作的任务的主体上下文。 +ä¸€ä¸ªä¾‹åæ˜¯åœ¨ç½‘ç»œæ–‡ä»¶ç³»ç»Ÿä¸Šæ‰“å¼€çš„æ–‡ä»¶ï¼Œæ‰“å¼€æ–‡ä»¶çš„å‡æ®åº”该被呈现给æœåŠ¡å™¨ï¼Œè€Œä¸ç®¡ +å®žé™…è¿›è¡Œè¯»å–æˆ–写入æ“作的是è°ã€‚ + + +æ–‡ä»¶æ ‡è®° +======== + +å˜å‚¨åœ¨ç£ç›˜ä¸Šæˆ–通过网络获å–的文件å¯èƒ½å…·æœ‰æ³¨é‡Šï¼Œæž„æˆè¯¥æ–‡ä»¶çš„对象安全上下文。 +æ ¹æ®æ–‡ä»¶ç³»ç»Ÿçš„类型,这些注释å¯èƒ½åŒ…括以下一项或多项: + + * UNIX UID, GID, mode; + * Windows user ID; + * Access control list; + * LSM security label; + * UNIX exec privilege escalation bits (SUID/SGID); + * File capabilities exec privilege escalation bits. + +å°†è¿™äº›ä¸Žä»»åŠ¡çš„ä¸»ä½“å®‰å…¨ä¸Šä¸‹æ–‡è¿›è¡Œæ¯”è¾ƒï¼Œå¹¶æ ¹æ®æ¯”较结果å…è®¸æˆ–ç¦æ¢æ‰§è¡ŒæŸäº›æ“作。 +在execve()çš„æƒ…å†µä¸‹ï¼Œç‰¹æƒæå‡ä½èµ·ä½œç”¨ï¼Œå¹¶ä¸”å¯èƒ½å…è®¸ç”±å¯æ‰§è¡Œæ–‡ä»¶çš„æ³¨é‡Šå†³å®šçš„ +进程获得é¢å¤–的特æƒã€‚ + + +ä»»åŠ¡å‡æ® +======== + +在Linuxä¸ï¼Œä¸€ä¸ªä»»åŠ¡çš„æ‰€æœ‰å‡æ®éƒ½ä¿å˜åœ¨ä¸€ä¸ªå¼•用计数结构体‘struct cred’ä¸ï¼Œ +通过(uid, gid)或(groups, keys, LSM security)进行访问。æ¯ä¸ªä»»åŠ¡åœ¨å…¶ +task_structä¸é€šè¿‡ä¸€ä¸ªå为‘cred’的指针指å‘其凿®ã€‚ + +ä¸€æ—¦ä¸€ç»„å‡æ®å·²ç»å‡†å¤‡å¥½å¹¶æäº¤ï¼Œé™¤éžä»¥ä¸‹å‡ ç§æƒ…况,å¦åˆ™ä¸èƒ½æ›´æ”¹ï¼š + + 1. 其引用计数å¯ä»¥æ›´æ”¹ï¼› + + 2. 它所指å‘çš„ group_info 结构体的引用计数å¯ä»¥æ›´æ”¹ï¼› + + 3. 它所指å‘的安全数æ®çš„引用计数å¯ä»¥æ›´æ”¹ï¼› + + 4. 它所指å‘的任何密钥环的引用计数å¯ä»¥æ›´æ”¹ï¼› + + 5. 它所指å‘的任何密钥环å¯ä»¥è¢«æ’¤é”€ã€è¿‡æœŸæˆ–其安全属性å¯ä»¥æ›´æ”¹ï¼› + + 6. 
它所指å‘的任何密钥环的内容å¯ä»¥æ›´æ”¹ï¼ˆå¯†é’¥çŽ¯çš„æ•´ä¸ªç›®çš„å°±æ˜¯ä½œä¸ºä¸€ç»„å…±äº«å‡æ®ï¼Œ + å¯ç”±å…·æœ‰é€‚当访问æƒé™çš„任何人修改)。 + +è¦æ›´æ”¹cred结构体ä¸çš„任何内容,必须éµå¾ªå¤åˆ¶å’Œæ›¿æ¢çš„原则。首先进行å¤åˆ¶ï¼Œç„¶åŽä¿® +改副本,最åŽä½¿ç”¨RCU(读-å¤åˆ¶-æ›´æ–°ï¼‰å°†ä»»åŠ¡æŒ‡é’ˆæ›´æ”¹ä¸ºæŒ‡å‘æ–°çš„副本。有一些å°è£…å¯ +用于帮助执行这个过程(è§ä¸‹æ–‡ï¼‰ã€‚ + +一个任务åªèƒ½ä¿®æ”¹è‡ªå·±çš„凿®ï¼›ä¸å†å…许一个任务修改å¦ä¸€ä¸ªä»»åŠ¡çš„å‡æ®ã€‚ +è¿™æ„å‘³ç€ ``capset()`` 系统调用ä¸å†å…许使用除当å‰è¿›ç¨‹ä¹‹å¤–的任何PID。 +æ¤å¤–, ``keyctl_instantiate()`` å’Œ ``keyctl_negate()`` 函数也ä¸å† +å…许在请求进程ä¸é™„åŠ åˆ°ç‰¹å®šäºŽè¿›ç¨‹çš„å¯†é’¥çŽ¯ï¼Œå› ä¸ºå®žä¾‹åŒ–è¿›ç¨‹å¯èƒ½éœ€è¦åˆ›å»ºå®ƒä»¬ã€‚ + + +ä¸å¯å˜å‡æ® +---------- + +ä¸€æ—¦ä¸€ç»„å‡æ®å·²ç»è¢«å…¬å¼€ï¼ˆä¾‹å¦‚通过调用 ``commit_creds()`` ),必须将其视为 +ä¸å¯å˜çš„,除了两个例外情况: + + 1. 引用计数å¯ä»¥è¢«ä¿®æ”¹ã€‚ + + 2. è™½ç„¶æ— æ³•æ›´æ”¹ä¸€ç»„å‡æ®çš„密钥环订阅,但订阅的密钥环的内容å¯ä»¥è¢«æ›´æ”¹ã€‚ + +为了在编译时æ•获æ„å¤–çš„å‡æ®ä¿®æ”¹ï¼Œstruct task_struct具有_const_指针指å‘其凿®é›†ï¼Œ +struct file也是如æ¤ã€‚æ¤å¤–,æŸäº›å‡½æ•°å¦‚ ``get_cred()`` å’Œ ``put_cred()`` 在 +const指针上æ“ä½œï¼Œå› æ¤ä¸éœ€è¦è¿›è¡Œç±»åž‹è½¬æ¢ï¼Œä½†éœ€è¦ä¸´æ—¶æ”¾å¼ƒconsté™å®šï¼Œä»¥ä¾¿èƒ½å¤Ÿä¿®æ”¹ +引用计数。 + + +è®¿é—®ä»»åŠ¡å‡æ® +------------ + +任务åªèƒ½ä¿®æ”¹è‡ªå·±çš„凿®ï¼Œå…许当å‰è¿›ç¨‹å¯ä»¥è¯»å–或替æ¢è‡ªå·±çš„凿®ï¼Œæ— 需任何形å¼é”定的 +情况下 —— è¿™æžå¤§ç®€åŒ–了事情。它å¯ä»¥è°ƒç”¨:: + + const struct cred *current_cred() + +èŽ·å–æŒ‡å‘其凿®ç»“构的指针,并且之åŽä¸å¿…释放它。 + +有一些方便的å°è£…ç”¨äºŽæ£€ç´¢ä»»åŠ¡å‡æ®çš„特定方é¢ï¼ˆåœ¨æ¯ç§æƒ…况下都åªè¿”回值):: + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + struct user_struct *current_user(void) Current's user account + +还有一些方便的å°è£…ï¼Œç”¨äºŽæ£€ç´¢ä»»åŠ¡å‡æ®çš„特定关è”对:: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +在从当å‰ä»»åŠ¡çš„å‡æ®ä¸æ£€ç´¢åŽï¼Œé€šè¿‡å…¶å‚数返回这些值对。 + + +æ¤å¤–,还有一个函数用于获å–当å‰è¿›ç¨‹çš„当å‰å‡æ®é›†çš„引用:: + + const struct cred *get_current_cred(void); + +以åŠç”¨äºŽèŽ·å–对一个实际上ä¸å˜åœ¨äºŽstruct credä¸çš„凿®çš„引用的函数:: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +分别获得对当å‰è¿›ç¨‹çš„ user accounting structure 和补充组列表的引用。 + +一旦获得引用,就必须使用 ``put_cred()``, ``free_uid()`` 或 +``put_group_info()`` æ¥é€‚当释放它。 + + +è®¿é—®å…¶ä»–ä»»åŠ¡çš„å‡æ® +------------------ + +虽然一个任务å¯ä»¥åœ¨ä¸éœ€è¦é”å®šçš„æƒ…å†µä¸‹è®¿é—®è‡ªå·±çš„å‡æ®ï¼Œä½†æƒ³è¦è®¿é—®å¦ä¸€ä¸ªä»»åŠ¡ +çš„å‡æ®çš„任务并éžå¦‚æ¤ã€‚它必须使用RCU读é”å’Œ ``rcu_dereference()``。 + +``rcu_dereference()`` 是由:: + + const struct cred *__task_cred(struct task_struct *task); + +这应该在RCU读é”ä¸ä½¿ç”¨ï¼Œå¦‚下例所示:: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +如果需è¦é•¿æ—¶é—´æŒæœ‰å¦ä¸€ä¸ªä»»åŠ¡çš„å‡æ®ï¼Œå¹¶ä¸”å¯èƒ½åœ¨æ¤è¿‡ç¨‹ä¸ä¼‘çœ ï¼Œåˆ™è°ƒç”¨æ–¹ +应该使用以下函数æ¥èŽ·å–å¯¹è¿™äº›å‡æ®çš„引用:: + + const struct cred *get_task_cred(struct task_struct *task); + +这个函数内部完æˆäº†æ‰€æœ‰çš„RCUæ“ä½œã€‚å½“ä½¿ç”¨å®Œè¿™äº›å‡æ®æ—¶ï¼Œè°ƒç”¨æ–¹å¿…须调用put_cred() +函数释放它们。 + +.. 
note:: + ``__task_cred()`` 的结果ä¸åº”ç›´æŽ¥ä¼ é€’ç»™ ``get_cred()`` , + å› ä¸ºè¿™å¯èƒ½ä¸Ž ``commit_cred()`` å‘生竞争æ¡ä»¶ã€‚ + +还有一些方便的函数å¯ä»¥è®¿é—®å¦ä¸€ä¸ªä»»åС凿®çš„特定部分,将RCUæ“作对调用方éšè—èµ·æ¥:: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +å¦‚æžœè°ƒç”¨æ–¹åœ¨æ¤æ—¶å·²ç»æŒæœ‰RCU读é”,则应使用:: + + __task_cred(task)->uid + __task_cred(task)->euid + +类似地,如果需è¦è®¿é—®ä»»åС凿®çš„多个方é¢ï¼Œåº”使用RCU读é”,调用 ``__task_cred()`` +函数,将结果å˜å‚¨åœ¨ä¸´æ—¶æŒ‡é’ˆä¸ï¼Œç„¶åŽä»Žä¸´æ—¶æŒ‡é’ˆä¸è°ƒç”¨å‡æ®çš„å„个方é¢ï¼Œæœ€åŽé‡Šæ”¾é”。 +è¿™æ ·å¯ä»¥é˜²æ¢å¤šæ¬¡è°ƒç”¨æ˜‚贵的RCUæ“作。 + +如果需è¦è®¿é—®å¦ä¸€ä¸ªä»»åС凿®çš„å…¶ä»–å•个方é¢ï¼Œå¯ä»¥ä½¿ç”¨:: + + task_cred_xxx(task, member) + +这里的‘member’是credç»“æž„ä½“çš„éžæŒ‡é’ˆæˆå‘˜ã€‚例如:: + + uid_t task_cred_xxx(task, suid); + +å°†ä»Žä»»åŠ¡ä¸æ£€ç´¢â€˜struct cred::suid’,并执行适当的RCUæ“作。对于指针æˆå‘˜ï¼Œ +ä¸èƒ½ä½¿ç”¨è¿™ç§å½¢å¼ï¼Œå› 为它们指å‘的内容å¯èƒ½åœ¨é‡Šæ”¾RCU读é”的瞬间消失。 + + +ä¿®æ”¹å‡æ® +-------- + +å¦‚å…ˆå‰æåˆ°çš„ï¼Œä¸€ä¸ªä»»åŠ¡åªèƒ½ä¿®æ”¹è‡ªå·±çš„凿®ï¼Œä¸èƒ½ä¿®æ”¹å…¶ä»–ä»»åŠ¡çš„å‡æ®ã€‚è¿™æ„味 +ç€å®ƒä¸éœ€è¦ä½¿ç”¨ä»»ä½•锿¥ä¿®æ”¹è‡ªå·±çš„凿®ã€‚ + +è¦ä¿®æ”¹å½“å‰è¿›ç¨‹çš„凿®ï¼Œå‡½æ•°åº”首先调用:: + + struct cred *prepare_creds(void); + +这将é”定current->cred_replace_mutex,然åŽåˆ†é…并构建当å‰è¿›ç¨‹å‡æ®çš„副本。 +如果æˆåŠŸï¼Œå‡½æ•°è¿”å›žæ—¶ä»ç„¶ä¿æŒäº’æ–¥é”ã€‚å¦‚æžœä¸æˆåŠŸï¼ˆå†…å˜ä¸è¶³ï¼‰ï¼Œåˆ™è¿”回NULL。 + +互斥é”é˜²æ¢ ``ptrace()`` åœ¨è¿›è¡Œå‡æ®æž„建和更改的安全检查时更改进程的ptrace +状æ€ï¼Œå› 为ptrace状æ€å¯èƒ½ä¼šæ”¹å˜ç»“果,特别是在 ``execve()`` 的情况下。 + +æ–°çš„å‡æ®é›†åº”é€‚å½“åœ°è¿›è¡Œä¿®æ”¹ï¼Œå¹¶è¿›è¡Œä»»ä½•å®‰å…¨æ£€æŸ¥å’ŒæŒ‚é’©ã€‚åœ¨æ¤æ—¶ï¼Œå½“å‰å’Œå»ºè®®çš„ +凿®é›†éƒ½å¯ç”¨ï¼Œå› 为current_cred()将返回当å‰çš„凿®é›†ã€‚ + +在替æ¢ç»„åˆ—è¡¨æ—¶ï¼Œå¿…é¡»åœ¨å°†å…¶æ·»åŠ åˆ°å‡æ®ä¹‹å‰å¯¹æ–°åˆ—表进行排åºï¼Œå› 为使用二分查找 +测试æˆå‘˜èµ„æ ¼ã€‚å®žé™…ä¸Šï¼Œè¿™æ„味ç€åœ¨set_groups()或set_current_groups()之 +å‰åº”调用groups_sort()。groups_sort()ä¸èƒ½åœ¨å…±äº«çš„ ``struct group_list`` +ä¸Šè°ƒç”¨ï¼Œå› ä¸ºå³ä½¿æ•°ç»„å·²ç»æŽ’åºï¼Œå®ƒä¹Ÿå¯èƒ½ä½œä¸ºæŽ’åºè¿‡ç¨‹çš„ä¸€éƒ¨åˆ†å¯¹å…ƒç´ è¿›è¡ŒæŽ’åˆ—ã€‚ + +当凿®é›†å‡†å¤‡å¥½æ—¶ï¼Œåº”通过调用以下函数将其æäº¤ç»™å½“å‰è¿›ç¨‹:: + + int commit_creds(struct cred *new); + +è¿™å°†ä¿®æ”¹å‡æ®å’Œè¿›ç¨‹çš„å„个方é¢ï¼Œç»™LSMæä¾›æœºä¼šåšåŒæ ·çš„修改,然åŽä½¿ç”¨ +``rcu_assign_pointer()`` å°†æ–°çš„å‡æ®å®žé™…æäº¤ç»™ ``current->cred`` , +释放 ``current->cred_replace_mutex`` 以å…许 ``ptrace()`` è¿›è¡Œæ“ +作,并通知调度程åºå’Œå…¶ä»–组件有关更改的情况。 + +该函数ä¿è¯è¿”回0,以便å¯ä»¥åœ¨è¯¸å¦‚ ``sys_setresuid()`` 函数的末尾进行尾调用。 + +请注æ„ï¼Œè¯¥å‡½æ•°ä¼šæ¶ˆè€—è°ƒç”¨è€…å¯¹æ–°å‡æ®çš„引用。调用者在æ¤ä¹‹åŽä¸åº”调用 +``put_cred()`` é‡Šæ”¾æ–°å‡æ®ã€‚ + +æ¤å¤–ï¼Œä¸€æ—¦æ–°çš„å‡æ®ä¸Šè°ƒç”¨äº†è¯¥å‡½æ•°ï¼Œå°±ä¸èƒ½è¿›ä¸€æ¥æ›´æ”¹è¿™äº›å‡æ®ã€‚ + + +如果在调用 ``prepare_creds()`` 之åŽå®‰å…¨æ£€æŸ¥å¤±è´¥æˆ–å‘生其他错误, +则应调用以下函数:: + + void abort_creds(struct cred *new); + +这将释放 ``prepare_creds()`` 获å–çš„ ``current->cred_replace_mutex`` çš„é”, +å¹¶é‡Šæ”¾æ–°çš„å‡æ®ã€‚ + +ä¸€ä¸ªå…¸åž‹çš„å‡æ®ä¿®æ”¹å‡½æ•°çœ‹èµ·æ¥åƒè¿™æ ·:: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +管ç†å‡æ® +-------- + +有一些函数用æ¥è¾…åŠ©å‡æ®ç®¡ç†: + + - ``void put_cred(const struct cred *cred);`` + + è¿™å°†é‡Šæ”¾å¯¹ç»™å®šå‡æ®é›†çš„å¼•ç”¨ã€‚å¦‚æžœå¼•ç”¨è®¡æ•°ä¸ºé›¶ï¼Œå‡æ®é›†å°†ç”± + RCU系统安排进行销æ¯ã€‚ + + - ``const struct cred *get_cred(const struct cred *cred);`` + + 这将获å–å¯¹æ´»åŠ¨å‡æ®é›†çš„引用。返回指å‘凿®é›†çš„æŒ‡é’ˆã€‚ + + - ``struct cred *get_new_cred(struct cred *cred);`` + + 这将获å–坹当剿£åœ¨æž„建且å¯å˜çš„凿®é›†çš„引用。返回指å‘凿®é›†çš„æŒ‡é’ˆã€‚ + +æ‰“å¼€æ–‡ä»¶å‡æ® +============ + 
+当打开新文件时,会获å–å¯¹æ‰“å¼€ä»»åŠ¡å‡æ®çš„å¼•ç”¨ï¼Œå¹¶å°†å…¶é™„åŠ åˆ°æ–‡ä»¶ç»“æž„ä½“çš„ +``f_cred`` å—æ®µä¸ï¼Œæ›¿ä»£åŽŸæ¥çš„ ``f_uid`` å’Œ ``f_gid`` 。原æ¥è®¿é—® +``file->f_uid`` å’Œ ``file->f_gid`` 的代ç 现在应访问 ``file->f_cred->fsuid`` +å’Œ ``file->f_cred->fsgid`` 。 + +安全访问 ``f_cred`` 的情况下å¯ä»¥ä¸ä½¿ç”¨RCUæˆ–åŠ é”ï¼Œå› ä¸ºæŒ‡å‘凿®çš„æŒ‡é’ˆ +ä»¥åŠæŒ‡å‘çš„å‡æ®ç»“构的内容在文件结构的整个生命周期ä¸ä¿æŒä¸å˜ï¼Œé™¤éžæ˜¯ +上述列出的例外情况(å‚é˜…ä»»åŠ¡å‡æ®éƒ¨åˆ†ï¼‰ã€‚ + +为了é¿å…“混淆代ç†â€æƒé™æå‡æ”»å‡»ï¼Œåœ¨æ‰“开的文件åŽç»æ“作时,访问控制检查 +åº”è¯¥ä½¿ç”¨è¿™äº›å‡æ®ï¼Œè€Œä¸æ˜¯ä½¿ç”¨â€œå½“å‰â€çš„凿®ï¼Œå› 为该文件å¯èƒ½å·²ç»è¢«ä¼ 递给 +一个更具特æƒçš„进程。 + +覆盖VFS坹凿®çš„使用 +=================== + +在æŸäº›æƒ…况下,需è¦è¦†ç›–VFSä½¿ç”¨çš„å‡æ®ï¼Œå¯ä»¥é€šè¿‡ä½¿ç”¨ä¸åŒçš„凿®é›†è°ƒç”¨ +如 ``vfs_mkdir()`` æ¥å®žçŽ°ã€‚ä»¥ä¸‹æ˜¯ä¸€äº›è¿›è¡Œæ¤æ“作的ä½ç½®: + + * ``sys_faccessat()``. + * ``do_coredump()``. + * nfs4recover.c. diff --git a/Documentation/translations/zh_CN/security/index.rst b/Documentation/translations/zh_CN/security/index.rst index d8aacd1930d9..78d9d4b36dca 100644 --- a/Documentation/translations/zh_CN/security/index.rst +++ b/Documentation/translations/zh_CN/security/index.rst @@ -15,20 +15,20 @@ .. toctree:: :maxdepth: 1 + credentials + snp-tdx-threat-model lsm sak + self-protection siphash + tpm/index digsig landlock TODOLIST: -* credentials -* snp-tdx-threat-model * IMA-templates * keys/index * lsm-development * SCTP -* self-protection -* tpm/index * secrets/index * ipe diff --git a/Documentation/translations/zh_CN/security/keys/index.rst b/Documentation/translations/zh_CN/security/keys/index.rst new file mode 100644 index 000000000000..7c28d003fb0a --- /dev/null +++ b/Documentation/translations/zh_CN/security/keys/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/keys/index.rst + +:翻译: + + +======== +å†…æ ¸å¯†é’¥ +======== + +.. toctree:: + :maxdepth: 1 + + +TODOLIST: +* core +* ecryptfs +* request-key +* trusted-encrypted diff --git a/Documentation/translations/zh_CN/security/secrets/index.rst b/Documentation/translations/zh_CN/security/secrets/index.rst new file mode 100644 index 000000000000..5ea78713f10e --- /dev/null +++ b/Documentation/translations/zh_CN/security/secrets/index.rst @@ -0,0 +1,17 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/secrets/index.rst + +:翻译: + +===================== +密钥文档 +===================== + +.. toctree:: + + +TODOLIST: + +* coco diff --git a/Documentation/translations/zh_CN/security/self-protection.rst b/Documentation/translations/zh_CN/security/self-protection.rst new file mode 100644 index 000000000000..3c8a68b1e1be --- /dev/null +++ b/Documentation/translations/zh_CN/security/self-protection.rst @@ -0,0 +1,271 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: ../disclaimer-zh_CN.rst +:Original: Documentation/security/self-protection.rst + +:翻译: + + å¼ å· zhangwei <zhangwei@cqsoftware.com.cn> + +============ +å†…æ ¸è‡ªæˆ‘ä¿æŠ¤ +============ + +å†…æ ¸è‡ªæˆ‘ä¿æŠ¤æ˜¯æŒ‡åœ¨Linuxå†…æ ¸ä¸è®¾è®¡ä¸Žå®žçŽ°çš„å„ç§ç³»ç»Ÿä¸Žç»“æž„ +以防æ¢å†…æ ¸æœ¬èº«çš„å®‰å…¨æ¼æ´žé—®é¢˜ã€‚它涵盖了广泛问题,包括去除 +æ•´ä¸ªç±»çš„æ¼æ´žï¼Œé˜»æ¢å®‰å…¨æ¼æ´žåˆ©ç”¨æ–¹æ³•,以åŠä¸»åŠ¨æ£€æµ‹æ”»å‡»å° +è¯•ã€‚å¹¶éžæ‰€æœ‰çš„è¯é¢˜éƒ½åœ¨æœ¬æ–‡ä¸æ¶‰åŠï¼Œä½†å®ƒåº”è¯¥ä¸ºäº†è§£å†…æ ¸è‡ªæˆ‘ +ä¿æŠ¤æä¾›ä¸€ä¸ªåˆç†çš„起点,并解ç”常è§çš„问题。(当然,欢迎æ +交补ä¸ï¼ï¼‰ + +在最å的情况下,我们å‡è®¾ä¸€ä¸ªéžç‰¹æƒçš„æœ¬åœ°æ”»å‡»è€…å¯¹å†…æ ¸å†…å˜ +有任æ„读写访问æƒé™ã€‚è™½ç„¶åœ¨è®¸å¤šæƒ…å†µä¸‹ï¼Œæ¼æ´žè¢«åˆ©ç”¨æ—¶å¹¶ä¸ä¼š +æä¾›æ¤çº§åˆ«çš„访问æƒé™ï¼Œä½†å¦‚æžœæˆ‘ä»¬èƒ½é˜²å¾¡æœ€åæƒ…况,也能应对 +æƒé™è¾ƒä½Žçš„æ”»å‡»ã€‚ä¸€ä¸ªæ›´é«˜çš„æ ‡å‡†ï¼Œä¸”éœ€è¦ç‰¢è®°çš„æ˜¯ä¿æŠ¤å†…æ ¸å… +å—具有特æƒçš„æœ¬åœ°æ”»å‡»è€…çš„æ”»å‡»ï¼Œå› ä¸ºroot用户å¯ä»¥æœ‰æ›´å¤šæƒé™ã€‚ +ï¼ˆå°¤å…¶æ˜¯å½“ä»–ä»¬èƒ½å¤ŸåŠ è½½ä»»æ„å†…æ ¸æ¨¡å—æ—¶ï¼‰ + +æˆåŠŸçš„è‡ªæˆ‘ä¿æŠ¤çš„ç›®æ ‡æ˜¯ï¼šæœ‰æ•ˆã€é»˜è®¤å¼€å¯ã€ä¸éœ€è¦å¼€å‘者主动 +é€‰æ‹©ã€æ²¡æœ‰æ€§èƒ½å½±å“ã€ä¸å¦¨ç¢å†…æ ¸è°ƒè¯•ã€å¹¶ä¸”没有测试。虽然很 +éš¾æ»¡è¶³æ‰€æœ‰çš„è¿™äº›ç›®æ ‡ï¼Œä½†æ˜Žç¡®æåˆ°è¿™äº›ç›®æ ‡éžå¸¸é‡è¦ï¼Œå› 为这 +些方é¢éœ€è¦è¢«æŽ¢ç´¢ã€è§£å†³æˆ–接å—。 + +========== +攻击é¢ç¼©å‡ +========== + +防æ¢å®‰å…¨æ¼æ´žæœ€åŸºæœ¬çš„é˜²å¾¡æ–¹å¼æ˜¯å‡å°‘å¯ä»¥è¢«ç”¨æ¥é‡å®šå‘执行的 +å†…æ ¸åŒºåŸŸã€‚è¿™åŒ…æ‹¬é™åˆ¶ç”¨æˆ·å…¬å¼€ä½¿ç”¨çš„APIã€ä½¿å†…æ ¸API更难被错 +è¯¯ä½¿ç”¨ã€æœ€å°åŒ–å¯å†™å†…æ ¸å†…å˜åŒºåŸŸç‰ã€‚ + +ä¸¥æ ¼çš„å†…æ ¸å†…å˜æƒé™ +------------------- + +å½“æ‰€æœ‰å†…æ ¸å†…å˜éƒ½æ˜¯å¯å†™çš„,攻击者å¯ä»¥è½»æ¾åœ°é‡å®šå‘执行æµã€‚ +为了å‡å°‘è¿™ç§æ”»å‡»ç›®æ ‡çš„å¯ç”¨æ€§ï¼Œå†…æ ¸éœ€è¦æ›´ä¸¥æ ¼çš„æƒé™é›†æ¥ +ä¿æŠ¤å…¶å†…å˜ã€‚ + +坿‰§è¡Œä»£ç å’Œåªè¯»æ•°æ®å¿…é¡»ä¸å¯å†™ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ä»»ä½•å…·æœ‰å¯æ‰§è¡Œå†…å˜çš„区域必须ä¸å¯å†™ï¼Œæ˜¾ç„¶è¿™ä¹ŸåŒ…æ‹¬å†…æ ¸æ–‡æœ¬ +æœ¬èº«ã€‚æˆ‘ä»¬è¿˜å¿…é¡»è€ƒè™‘å…¶ä»–åœ°æ–¹ï¼šå†…æ ¸æ¨¡å—ã€JIT内å˜ç‰ï¼Œï¼ˆåœ¨ +æŸäº›æƒ…况下,为了支æŒåƒæŒ‡ä»¤æ›¿ä»£ã€æ–点ã€kprobesç‰åŠŸèƒ½ï¼Œè¿™äº› +区域会暂时被设置为å¯å†™ã€‚如果这些功能必须å˜åœ¨äºŽå†…æ ¸ä¸ï¼Œå®ƒ +ä»¬çš„å®žçŽ°æ–¹å¼æ˜¯ï¼šåœ¨æ›´æ–°æœŸé—´å°†å†…å˜ä¸´æ—¶è®¾ç½®å¯å†™ï¼Œç„¶åŽå†æ¢å¤ +为原始æƒé™ã€‚) + +为了支æŒè¿™ä¸€ç‚¹ï¼ŒCONFIG_STRICT_KERNEL_RWX å’Œ +CONFIG_STRICT_MODULE_RWX 的设计旨在确ä¿ä»£ç ä¸å¯å†™ï¼Œæ•°æ®ä¸ +坿‰§è¡Œï¼Œä»¥åŠåªè¯»æ•°æ®æ—¢ä¸å¯å†™ä¹Ÿä¸å¯æ‰§è¡Œã€‚ + +大多数架构默认支æŒè¿™äº›é€‰é¡¹ï¼Œä¸”ç”¨æˆ·æ— æ³•é€‰æ‹©ã€‚å¯¹äºŽä¸€äº›åƒarm +è¿™ç§å¸Œæœ›èƒ½å¤Ÿé€‰æ‹©è¿™äº›é€‰é¡¹çš„æž¶æž„,å¯ä»¥åœ¨æž¶æž„Kconfigä¸é€‰æ‹© +ARCH_OPTIONAL_KERNEL_RWX以å¯ç”¨Kconfigæç¤ºã€‚ +CONFIG_ARCH_OPTIONAL_KERNEL_RWX_DEFAULT决定在å¯ç”¨ +ARCH_OPTIONAL_KERNEL_RWX时的默认设置。 + +å‡½æ•°æŒ‡é’ˆå’Œæ•æ„Ÿå˜é‡å¿…é¡»ä¸å¯å†™ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +å†…æ ¸å†…å˜ä¸æœ‰å¤§é‡çš„å‡½æ•°æŒ‡é’ˆï¼Œè¿™äº›æŒ‡é’ˆè¢«å†…æ ¸æŸ¥æ‰¾å¹¶ç”¨äºŽç»§ç»æ‰§è¡Œ +(例如,æè¿°ç¬¦/å‘é‡è¡¨ã€æ–‡ä»¶/ç½‘ç»œç‰æ“作结构ç‰ï¼‰ã€‚这些å˜é‡çš„æ•° +é‡å¿…é¡»å‡å°‘到最低é™åº¦ + +许多åƒè¿™æ ·çš„å˜é‡å¯ä»¥é€šè¿‡è®¾ç½®ä¸º"const"æ¥å®žçްåªè¯»ï¼Œä»Žè€Œä½¿å®ƒä»¬ +å˜æ”¾åœ¨å†…æ ¸çš„.rodata段而éž.dataæ®µï¼Œä»Žè€ŒèŽ·å¾—å†…æ ¸ä¸¥æ ¼å†…å˜æƒé™çš„ +ä¿æŠ¤ã€‚ + +对于在_init是仅åˆå§‹åŒ–一次的å˜é‡ï¼Œå¯ä»¥ä½¿ç”¨_ro_after_init属性 +è¿›è¡Œæ ‡è®°ã€‚ + +剩下的å˜é‡é€šå¸¸æ˜¯é‚£äº›æ›´æ–°é¢‘率较低的(例如GDT)。这些å˜é‡éœ€è¦å¦ +一个机制(类似于上述æåˆ°çš„å¯¹å†…æ ¸ä»£ç æ‰€åšçš„临时例外),以便在 +å…¶ä½™ç”Ÿå‘½å‘¨æœŸå†…ä¿æŒåªè¯»çжæ€ã€‚ï¼ˆä¾‹å¦‚ï¼Œåœ¨è¿›è¡Œæ›´æ–°æ—¶ï¼Œåªæœ‰æ‰§è¡Œ +æ›´æ–°çš„CPU线程会被授予对内å˜çš„ä¸å¯ä¸æ–写入访问æƒé™ã€‚) + +å°†å†…æ ¸å†…å˜ä¸Žç”¨æˆ·ç©ºé—´å†…å˜åˆ†éš”å¼€ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +å†…æ ¸ç»å¯¹ä¸å¯ä»¥æ‰§è¡Œç”¨æˆ·ç©ºé—´å†…å˜ï¼ŒåŒæ—¶ï¼Œå†…æ ¸ä¹Ÿä¸å¾—在没有明确预 +期的情况下访问用户内å˜ç©ºé—´ã€‚这些规则å¯ä»¥é€šè¿‡ä¸€äº›ç¡¬ä»¶é™åˆ¶æ¥æ”¯ +æŒï¼ˆå¦‚x86çš„SMEP/SMAP,ARMçš„PXN/PAN)或通过仿真(如ARMçš„å†…å˜ 
+域)æ¥å¼ºåˆ¶æ‰§è¡Œã€‚é€šè¿‡è¿™ç§æ–¹å¼é˜»æ¢ç”¨æˆ·ç©ºé—´å†…å˜çš„访问,攻击者就 +æ— æ³•å°†æ‰§è¡Œå’Œæ•°æ®è§£æžè½¬ç§»åˆ°æ˜“于控制的用户空间内å˜ï¼Œä»Žè€Œè¿«ä½¿æ”» +å‡»å®Œå…¨åœ¨å†…æ ¸ä¸è¿›è¡Œã€‚ + +å‡å°‘对系统调用的访问 +-------------------- + +对于64ä½ç³»ç»Ÿï¼Œä¸€ç§æ¶ˆé™¤è®¸å¤šç³»ç»Ÿè°ƒç”¨æœ€ç®€å•的方法是构建时ä¸å¯ç”¨ +CONFIG_CONPATã€‚ç„¶è€Œï¼Œè¿™ç§æƒ…况通常ä¸å¯è¡Œã€‚ + +“seccompâ€ç³»ç»Ÿä¸ºç”¨æˆ·ç©ºé—´æä¾›äº†ä¸€ç§å¯é€‰åŠŸèƒ½ï¼Œæä¾›äº†ä¸€ç§å‡å°‘å¯ä¾› +è¿è¡Œä¸è¿›ç¨‹ä½¿ç”¨å†…æ ¸å…¥å£ç‚¹æ•°é‡çš„æ–¹æ³•。这é™åˆ¶äº†å¯ä»¥è®¿é—®å†…æ ¸ä»£ç +的范围,å¯èƒ½é™ä½Žäº†æŸä¸ªç‰¹å®šæ¼æ´žè¢«æ”»å‡»è€…利用的å¯èƒ½æ€§ã€‚ + +ä¸€ä¸ªæ”¹è¿›çš„æ–¹å‘æ˜¯åˆ›å»ºæœ‰æ•ˆçš„æ–¹æ³•,仅å…许å—信任的进程访问例如兼 +容模å¼ã€ç”¨æˆ·å‘½å空间ã€BPF创建和性能分æžç‰åŠŸèƒ½ã€‚è¿™å°†æŠŠå†…æ ¸å…¥å£ +点范围é™åˆ¶åœ¨é€šå¸¸å¯ä»¥è¢«éžç‰¹æƒç”¨æˆ·ç©ºé—´è¿›ç¨‹è®¿é—®çš„较常è§é›†åˆä¸ + +é™åˆ¶å¯¹å†…æ ¸æ¨¡å—的访问 +-------------------- + +å†…æ ¸ç»ä¸åº”å…许éžç‰¹æƒç”¨æˆ·åŠ è½½ç‰¹å®šçš„å†…æ ¸æ¨¡å—ï¼Œå› ä¸ºè¿™å¯èƒ½ä¸ºæ”»å‡»è€… +æä¾›ä¸€ä¸ªæ„外扩展的å¯ç”¨æ”»å‡»é¢çš„æ–¹æ³•。(通过已预定义åç³»ç»ŸæŒ‰éœ€åŠ +载模å—,如MODULE_ALIAS_*,被认为是“预期的â€ï¼Œä½†å³ä¾¿å¦‚æ¤ï¼Œä¹Ÿåº”对 +这些情况给予更多的关注。)例如,通过éžç‰¹æƒçš„套接å—APIåŠ è½½æ–‡ä»¶ +ç³»ç»Ÿæ¨¡å—æ˜¯æ²¡æœ‰æ„ä¹‰çš„ï¼šåªæœ‰rootç”¨æˆ·æˆ–ç‰©ç†æœ¬åœ°ç”¨æˆ·åº”è¯¥è§¦å‘æ–‡ä»¶ç³» +统模å—çš„åŠ è½½ã€‚ï¼ˆåœ¨æŸäº›æƒ…况下,这甚至å¯èƒ½å˜åœ¨äº‰è®®ã€‚) + +为了防æ¢ç‰¹æƒç”¨æˆ·çš„æ”»å‡»ï¼Œç³»ç»Ÿå¯èƒ½éœ€è¦å®Œå…¨ç¦æ¢æ¨¡å—åŠ è½½ï¼ˆä¾‹å¦‚ï¼Œé€š +过å•ä½“å†…æ ¸æž„å»ºæˆ–modules_disabled sysctl),或者使用ç¾å模å—(例 +如,CONFIG_MODULE_SIG_FORCE或通过LoadPinä¿æŠ¤çš„dm-crypt),以防 +æ¢root用户通过模å—åŠ è½½å™¨åŠ è½½ä»»æ„å†…æ ¸ä»£ç 。 + +内å˜å®Œæ•´æ€§ +---------- + +å†…æ ¸ä¸æœ‰è®¸å¤šå†…å˜ç»“构在攻击过程ä¸è¢«å®šæœŸæ³›æ»¥ç”¨ä»¥èŽ·å–æ‰§è¡ŒæŽ§åˆ¶ï¼Œè¿„今 +为æ¢ï¼Œæœ€å¸¸è§çš„æ˜¯å †æ ˆç¼“å†²åŒºæº¢å‡ºï¼Œåœ¨è¿™ç§æ”»å‡»ä¸ï¼Œå †æ ˆä¸Šå˜å‚¨çš„返回地 +å€è¢«è¦†ç›–。除æ¤ä¹‹å¤–,还有许多其他类型的攻击,防护措施也应è¿è€Œç”Ÿã€‚ + +å †æ ˆç¼“å†²åŒºæº¢å‡º +-------------- + +ç»å…¸çš„å †æ ˆç¼“å†²åŒºæº¢å‡ºæ”»å‡»æ˜¯æŒ‡è¶…å‡ºæ ˆä¸Šåˆ†é…çš„å˜é‡é¢„期大å°ï¼Œä»Žè€Œå°†ä¸€ +ä¸ªå—æŽ§å€¼å†™å…¥æ ˆå¸§çš„è¿”å›žåœ°å€ã€‚最常è§çš„é˜²å¾¡æŽªæ–½æ˜¯å †æ ˆä¿æŠ¤ +(CONFIG_STACKPROTECTOR),它在函数返回å‰ä¼šéªŒè¯æ ˆä¸Šçš„“stack canaryâ€ã€‚ +其他防御措施还包括影åå †æ ˆç‰ã€‚ + +å †æ ˆæ·±åº¦æº¢å‡º +------------ + +一个ä¸å¤ªå®¹æ˜“被ç†è§£çš„æ”»å‡»æ–¹å¼æ˜¯åˆ©ç”¨bug触å‘å†…æ ¸é€šè¿‡æ·±åº¦å‡½æ•°è°ƒç”¨æˆ– +å¤§çš„å †æ ˆåˆ†é…æ¥æ¶ˆè€—å †æ ˆå†…å˜ã€‚é€šè¿‡è¿™ç§æ”»å‡»ï¼Œæ”»å‡»è€…å¯ä»¥å°†æ•°æ®å†™å…¥å†… +æ ¸é¢„åˆ†é…å †æ ˆç©ºé—´ä¹‹å¤–çš„æ•æ„Ÿç»“æž„ã€‚ä¸ºäº†æ›´å¥½çš„é˜²æŠ¤è¿™ç§æ”»å‡»ï¼Œå¿…须进行 +两项é‡è¦çš„æ›´æ”¹ï¼šå°†æ•感的线程信æ¯ç»“æž„è½¬ç§»åˆ°å…¶ä»–åœ°æ–¹ï¼Œå¹¶åœ¨å †æ ˆåº•éƒ¨ +æ·»åŠ ä¸€ä¸ªæ•…éšœå†…å˜æ´žï¼Œä»¥æ•获这些溢出 + +æ ˆå†…å˜å®Œæ•´æ€§ +------------ + +ç”¨äºŽè·Ÿè¸ªå †ç©ºé—²åˆ—è¡¨çš„ç»“æž„å¯ä»¥åœ¨åˆ†é…和释放时进行完整性检查,以确ä¿å®ƒ +们ä¸ä¼šè¢«ç”¨æ¥æ“作其它内å˜åŒºåŸŸã€‚ + +计算器完整性 +------------ + +å†…æ ¸ä¸çš„许多地方使用原å计数器æ¥è·Ÿè¸ªå¯¹è±¡å¼•用或执行类似的生命周期管 +ç†ã€‚当这些计数器å¯èƒ½å‘ç”Ÿæº¢å‡ºæ—¶ï¼ˆæ— è®ºæ˜¯ä¸Šæº¢è¿˜æ˜¯ä¸‹æº¢ï¼‰ï¼Œè¿™é€šå¸¸ä¼šæš´éœ² +出使用åŽé‡Šæ”¾ï¼ˆuse-after-freeï¼‰æ¼æ´žã€‚é€šè¿‡æ•æ‰åŽŸåè®¡æ•°å™¨æº¢å‡ºï¼Œè¿™ç±»æ¼ +æ´žå°±å¯ä»¥æ¶ˆå¤±ã€‚ + +大å°è®¡ç®—溢出检测 +---------------- + +与计算器溢出类似,整数溢出(通常是大å°è®¡ç®—)需è¦åœ¨è¿è¡Œæ—¶è¿›è¡Œæ£€æµ‹ï¼Œ +以防æ¢è¿™ç±»åœ¨ä¼ ç»Ÿä¸Šä¼šå¯¼è‡´èƒ½å¤Ÿå†™å…¥å†…æ ¸ç¼“å†²åŒºæœ«å°¾ä¹‹å¤–çš„æ¼æ´žã€‚ + +概率性防御 +---------- + +尽管许多防御措施å¯ä»¥è¢«è®¤å®šæ˜¯ç¡®å®šçš„(例如,åªè¯»å†…å˜ä¸èƒ½å†™å…¥ï¼‰ï¼Œä½† +æœ‰äº›ç¡®ä¿æŽªæ–½ä»…æä¾›ç»Ÿè®¡é˜²å¾¡ï¼Œå³æ”»å‡»è€…必须收集足够的关于è¿è¡Œç³»ç»Ÿçš„ +ä¿¡æ¯æ‰èƒ½çªç ´é˜²å¾¡ã€‚尽管这些防御并ä¸å®Œç¾Žï¼Œä½†å®ƒä»¬ç¡®å®žæä¾›äº†æœ‰æ„义的 +ä¿æŠ¤ã€‚ + +æ ˆä¿æŠ¤ã€è¿·æƒ‘技术和其他秘密 +-------------------------- + +值得注æ„的是,åƒä¹‹å‰è®¨è®ºçš„æ ˆä¿æŠ¤è¿™æ ·çš„æŠ€æœ¯ï¼Œä»ŽæŠ€æœ¯ä¸Šæ¥è¯´æ˜¯ç»Ÿè®¡æ€§é˜² +å¾¡ï¼Œå› ä¸ºå®ƒä»¬ä¾èµ–äºŽä¸€ä¸ªç§˜å¯†å€¼ï¼Œè€Œè¿™æ ·çš„å€¼å¯èƒ½ä¼šé€šè¿‡ä¿¡æ¯æ³„éœ²æ¼æ´žè€Œè¢« +å‘现。 + +对于想JITï¼ˆåŠæ—¶ç¿»è¯‘å™¨ï¼‰è¿™æ 
·çš„æƒ…å†µï¼Œå…¶ä¸å¯æ‰§è¡Œå†…容å¯èƒ½éƒ¨åˆ†ç”±ç”¨æˆ·ç©ºé—´ +控制,也需è¦ç±»ä¼¼çš„秘密之æ¥è¿·æƒ‘。 + +至关é‡è¦çš„æ˜¯ï¼Œæ‰€ä½¿ç”¨çš„秘密值必须是独立的(例如,æ¯ä¸ªæ ˆä½¿ç”¨ä¸åŒçš„æ ˆ +ä¿æŠ¤å€¼ï¼‰ï¼Œå¹¶ä¸”å…·æœ‰é«˜ç†µï¼ˆä¾‹å¦‚ï¼Œéšæœºæ•°ç”Ÿæˆå™¨ï¼ˆRNGï¼‰æ˜¯å¦æ£å¸¸å·¥ä½œï¼Ÿï¼‰ï¼Œ +以最大é™åº¦åœ°æé«˜å…¶æˆåŠŸçŽ‡ã€‚ + +å†…æ ¸åœ°å€ç©ºé—´å¸ƒå±€éšæœºåŒ–ï¼ˆKASLR) +------------------------------- + +ç”±äºŽå†…æ ¸å†…å˜çš„ä½ç½®å‡ 乎总是攻击æˆåŠŸçš„å…³é”®å› ç´ ï¼Œå› æ¤ä½¿å†…æ ¸å†…å˜ä½ç½®å˜ +å¾—éžç¡®å®šæ€§ä¼šå¢žåŠ æ”»å‡»çš„éš¾åº¦ã€‚ï¼ˆè¯·æ³¨æ„,这åè¿‡æ¥æé«˜äº†ä¿¡æ¯æ³„露的价 +å€¼ï¼Œå› ä¸ºæ³„éœ²çš„ä¿¡æ¯å¯ä»¥ç”¨æ¥å‘çŽ°ç›®æ ‡å†…å˜ä½ç½®ã€‚) + +文本和模å—åŸºå€ +-------------- + +通过在å¯åŠ¨æ—¶é‡æ–°è®¾å®šå†…æ ¸çš„ç‰©ç†åŸºåœ°å€å’Œè™šæ‹ŸåŸºåœ°å€ +(CONFIG_RANDOMIZE_BASE),那些需è¦åˆ©ç”¨å†…æ ¸ä»£ç 的攻击将会å—阻。æ¤å¤– +通过å移模å—åŠ è½½åŸºåœ°å€ï¼Œæ„味ç€å³ä½¿ç³»ç»Ÿæ¯æ¬¡å¯åŠ¨æ—¶æŒ‰ç›¸åŒé¡ºåºåŠ è½½åŒä¸€ +组模å—,这些模å—也ä¸ä¼šä¸Žå†…æ ¸æ–‡æœ¬çš„å…¶ä½™éƒ¨åˆ†å…¬ç”¨ä¸€ä¸ªåŸºåœ°å€ã€‚ + +å †æ ˆåŸºåœ°å€ +---------- + +å¦‚æžœè¿›ç¨‹ä¹‹é—´å†…æ ¸å †æ ˆçš„åŸºåœ°å€ä¸ç›¸åŒï¼Œç”šè‡³åœ¨ä¸åŒç³»ç»Ÿè°ƒç”¨ä¹‹é—´ä¹Ÿä¸ç›¸åŒï¼Œ +é‚£ä¹ˆæ ˆä¸Šæˆ–è¶…å‡ºæ ˆçš„ç›®æ ‡ä½ç½®å°±ä¼šå˜å¾—æ›´åŠ éš¾ä»¥ç¡®å®šã€‚ + +动æ€å†…å˜åŸºå€ +------------ + +å¾ˆå¤šå†…æ ¸çš„åŠ¨æ€å†…å˜ï¼ˆä¾‹å¦‚kmalloc,vmallocç‰ï¼‰ç”±äºŽæ—©æœŸå¯åЍåˆå§‹åŒ–的顺 +åºï¼Œæœ€ç»ˆå¸ƒå±€æ˜¯ç›¸å¯¹ç¡®å®šçš„。如果这些区域的基地å€åœ¨å¯åЍ之间ä¸ç›¸åŒï¼Œæ”» +å‡»è€…å°±æ— æ³•è½»æ˜“å®šä½å®ƒä»¬ï¼Œå¿…é¡»ä¾èµ–äºŽé’ˆå¯¹è¯¥åŒºåŸŸçš„ä¿¡æ¯æ³„露æ‰èƒ½æˆåŠŸã€‚ + +结构布局 +-------- + +é€šè¿‡åœ¨æ¯æ¬¡æž„å»ºæ—¶å¯¹æ•æ„Ÿç»“æž„çš„å¸ƒå±€è¿›è¡ŒéšæœºåŒ–处ç†ï¼Œæ”»å‡»è¿™å¿…须将攻击调 +èŠ‚åˆ°å·²çŸ¥çš„å†…æ ¸ç‰ˆæœ¬ï¼Œæˆ–è€…æ³„éœ²è¶³å¤Ÿçš„å†…æ ¸å†…å˜æ¥ç¡®å®šç»“æž„å¸ƒå±€ï¼Œç„¶åŽæ‰èƒ½ +对其进行æ“作。 + +防æ¢ä¿¡æ¯æ³„露 +------------ + +ç”±äºŽæ•æ„Ÿç»“构的ä½ç½®æ˜¯æ”»å‡»çš„主è¦ç›®æ ‡ï¼Œå› æ¤é˜²æ¢å†…æ ¸å†…å˜åœ°å€å’Œå†…æ ¸å†…å˜ +内容泄露éžå¸¸é‡è¦ï¼ˆå› 为它们å¯èƒ½åŒ…å«å†…æ ¸åœ°å€æˆ–è€…å…¶ä»–æ•æ„Ÿæ•°æ®ï¼Œä¾‹å¦‚ +æ ˆä¿æŠ¤å€¼ï¼‰ã€‚ + +å†…æ ¸åœ°å€ +-------- + +å°†å†…æ ¸åœ°å€æ‰“å°åˆ°ç”¨æˆ·ç©ºé—´ä¼šæ³„éœ²æœ‰å…³å†…æ ¸å†…å˜å¸ƒå±€çš„æ•æ„Ÿä¿¡æ¯ã€‚在使用任 +何打å°ç¬¦å·æ‰“å°åŽŸå§‹åœ°å€æ—¶ï¼Œç›®å‰%px,%p[ad](和在æŸäº›æƒ…况下的%p[sSb]) +æ—¶ã€‚ä½¿ç”¨è¿™äº›æ ¼å¼ç¬¦å†™å…¥çš„æ–‡ä»¶éœ€è¦é™åˆ¶ä¸ºåªæœ‰ç‰¹æƒè¿›ç¨‹å¯è¯»ã€‚ + +在4.14åŠä»¥å‰çš„å†…æ ¸ç‰ˆæœ¬ä¸ï¼Œä½¿ç”¨%pæ ¼å¼ç¬¦æ‰“å°çš„æ˜¯åŽŸå§‹åœ°å€ã€‚从4.15-rcl +版本开始,使用%pæ ¼å¼ç¬¦æ‰“å°çš„地å€ä¼šåœ¨æ‰“å°å‰è¿›è¡Œå“ˆå¸Œå¤„ç†ã€‚ + +[*]如果å¯ç”¨KALLSYMSå¹¶ä¸”ç¬¦å·æŸ¥æ‰¾å¤±è´¥ï¼Œåˆ™æ‰“å°åŽŸå§‹åœ°å€ï¼›å¦‚果没有å¯ç”¨ +KALLSYSM,则会直接打å°åŽŸå§‹åœ°å€ã€‚ + +å”¯ä¸€æ ‡è¯†ç¬¦ +---------- + +å†…æ ¸å†…å˜åœ°å€ç»ä¸å¯èƒ½ç”¨ä½œå‘ç”¨æˆ·ç©ºé—´å…¬å¼€çš„æ ‡è¯†ç¬¦ã€‚ç›¸å,应该使用原å +计数器,IDR(IDæ˜ å°„è¡¨ï¼‰æˆ–ç±»ä¼¼çš„å”¯ä¸€æ ‡è¯†ç¬¦ã€‚ + +内å˜åˆå§‹åŒ– +---------- + +å¤åˆ¶åˆ°ç”¨æˆ·ç©ºé—´çš„内å˜å¿…须始终被完全åˆå§‹åŒ–,如果没有显å¼åœ°ä½¿ç”¨memset() +函数进行åˆå§‹åŒ–,那就需è¦ä¿®æ”¹ç¼–è¯‘å™¨ï¼Œç¡®ä¿æ¸…除结构ä¸çš„空洞。 + +å†…å˜æ¸…除 +-------- + +åœ¨é‡Šæ”¾å†…å˜æ—¶ï¼Œæœ€å¥½å¯¹å†…å˜å†…容进行清除处ç†ï¼Œä»¥é˜²æ¢æ”»å‡»è€…é‡ç”¨å†…å˜ä¸ä»¥å‰ +çš„å†…å®¹ã€‚ä¾‹å¦‚ï¼Œåœ¨ç³»ç»Ÿè°ƒç”¨è¿”å›žæ—¶æ¸…é™¤å †æ ˆï¼ˆCONFIG_GCC_PLUGIN_STACKLEAK), +åœ¨é‡Šæ”¾å †å†…å®¹æ˜¯æ¸…é™¤å…¶å†…å®¹ã€‚è¿™æœ‰åŠ©äºŽé˜²æ¢è®¸å¤šæœªåˆå§‹åŒ–å˜é‡æ”»å‡»ã€å †æ ˆå†…容 +泄露ã€å †å†…容泄露以åŠä½¿ç”¨åŽé‡Šæ”¾æ”»å‡»ï¼ˆuser-after-free)。 + +ç›®æ ‡è¿½è¸ª +-------- + +ä¸ºäº†å¸®åŠ©æ¶ˆé™¤å¯¼è‡´å†…æ ¸åœ°å€è¢«å†™å…¥ç”¨æˆ·ç©ºé—´çš„å„ç§é”™è¯¯ï¼Œéœ€è¦è·Ÿè¸ªå†™å…¥çš„ç›®æ ‡ã€‚ +å¦‚æžœç¼“å†²åŒºçš„ç›®æ ‡æ˜¯ç”¨æˆ·ç©ºé—´ï¼ˆä¾‹å¦‚ï¼ŒåŸºäºŽseq_fileçš„/proc文件),则应该自 +åŠ¨å®¡æŸ¥æ•æ„Ÿå€¼ã€‚ diff --git a/Documentation/translations/zh_CN/security/snp-tdx-threat-model.rst b/Documentation/translations/zh_CN/security/snp-tdx-threat-model.rst new file mode 100644 index 000000000000..b51eeaebab67 --- /dev/null +++ b/Documentation/translations/zh_CN/security/snp-tdx-threat-model.rst @@ -0,0 +1,209 @@ +.. 
SPDX-License-Identifier: GPL-2.0 +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/security/snp-tdx-threat-model.rst + +:翻译: + + 毛玉贤 Yuxian Mao <maoyuxian@cqsoftware.com.cn> + +========================== +Linuxä¸x86虚拟化的机密计算 +========================== + +.. contents:: :local: + +By: Elena Reshetova <elena.reshetova@intel.com> and Carlos Bilbao <carlos.bilbao.osdev@gmail.com> + +动机 +==== + +在x86虚拟环境ä¸ä»Žäº‹æœºå¯†è®¡ç®—å·¥ä½œçš„å†…æ ¸å¼€å‘äººå‘˜ï¼Œæ˜¯åŸºäºŽä¸€ç»„ä¸Žä¼ ç»ŸLinuxå†…æ ¸ +å¨èƒæ¨¡åž‹æœ‰æ‰€ä¸åŒçš„å‡è®¾æ¡ä»¶ä¸‹å¼€å±•å·¥ä½œçš„ã€‚ä¼ ç»Ÿæ„义上,Linuxå¨èƒæ¨¡åž‹æ‰¿è®¤æ”» +击者å¯ä»¥å˜åœ¨äºŽç”¨æˆ·ç©ºé—´ï¼Œä»¥åŠä¸€å°éƒ¨åˆ†èƒ½å¤Ÿé€šè¿‡å„ç§ç½‘ç»œæŽ¥å£æˆ–有é™çš„硬件特定 +暴露接å£ï¼ˆå¦‚USBã€Thunderboltï¼‰ä¸Žå†…æ ¸äº¤äº’çš„å¤–éƒ¨æ”»å‡»è€…ã€‚æœ¬æ–‡æ¡£çš„ç›®çš„æ˜¯è§£é‡Š +在机密计算领域ä¸å‡ºçŽ°çš„é¢å¤–攻击å‘é‡ï¼Œå¹¶è®¨è®ºä¸º Linux å†…æ ¸æå‡ºçš„ä¿æŠ¤æœºåˆ¶ã€‚ + +æ¦‚è¿°ä¸Žæœ¯è¯ +========== + +机密计算(Confidential Computing,简称CoCo)是一个广泛的术è¯ï¼Œæ¶µç›–äº†å¤šç§ +æ—¨åœ¨ä¿æŠ¤æ•°æ®åœ¨ä½¿ç”¨è¿‡ç¨‹ä¸ï¼ˆä¸Žé™æ€æ•°æ®æˆ–ä¼ è¾“æ•°æ®ç›¸æ¯”)的机密性和完整性的安 +全技术。从本质上讲,机密计算(CoCo)解决方案æä¾›äº†ä¸€ä¸ªå—信任执行环境(TEE), +在该环境ä¸å¯ä»¥è¿›è¡Œå®‰å…¨çš„æ•°æ®å¤„ç†ï¼Œå› æ¤ï¼Œå®ƒä»¬é€šå¸¸æ ¹æ®é¢„期在TEEä¸è¿è¡Œçš„软件 +æ¥è¿›ä¸€æ¥åˆ’分为ä¸åŒçš„å类型。本文档专注于一类针对虚拟化环境的机密计算技术 +(Confidential Computing, CoCo),这些技术å…许在å¯ä¿¡æ‰§è¡ŒçŽ¯å¢ƒ +(Trusted Execution Environment, TEE)ä¸è¿è¡Œè™šæ‹Ÿæœºï¼ˆVM)。从现在起,本文档 +将把这一类机密计算(CoCo)技术称为“虚拟化环境(VE)ä¸çš„æœºå¯†è®¡ç®—(CoCo)â€ã€‚ + +在虚拟化环境ä¸ï¼Œæœºå¯†è®¡ç®—(CoCo)指的是一组硬件和/或软件技术,这些技术能够 +为在CoCo虚拟机(VM)内è¿è¡Œçš„软件æä¾›æ›´å¼ºçš„安全ä¿éšœã€‚具体æ¥è¯´ï¼Œæœºå¯†è®¡ç®—å…许 +其用户确认所有软件组件的å¯ä¿¡åº¦ï¼Œä»Žè€Œå°†å…¶åŒ…å«åœ¨ç²¾ç®€çš„å—信任计算基(TCB)ä¸ï¼Œ +这是基于机密计算具备验è¯è¿™äº›å—信组件状æ€çš„能力。 + +虽然ä¸åŒæŠ€æœ¯ä¹‹é—´çš„具体实现细节有所ä¸åŒï¼Œä½†æ‰€æœ‰çŽ°æœ‰æœºåˆ¶éƒ½æ—¨åœ¨ä¸ºè™šæ‹Ÿæœºçš„å®¢æˆ· +内å˜å’Œæ‰§è¡Œçжæ€ï¼ˆvCPU寄å˜å™¨ï¼‰æä¾›æ›´é«˜çš„æœºå¯†æ€§å’Œå®Œæ•´æ€§ï¼Œæ›´ä¸¥æ ¼åœ°æŽ§åˆ¶å®¢æˆ·ä¸æ– +注入,并æä¾›ä¸€äº›é¢å¤–æœºåˆ¶æ¥æŽ§åˆ¶å®¢æˆ·ä¸Žå®¿ä¸»æœºä¹‹é—´çš„é¡µæ˜ å°„ã€‚æœ‰å…³x86特定解决方案 +的更多细节,å¯ä»¥å‚考 +:doc:`Intel Trust Domain Extensions (TDX) </arch/x86/tdx>` å’Œ +`AMD Memory Encryption <https://www.amd.com/system/files/techdocs/sev-snp-strengthening-vm-isolation-with-integrity-protection-and-more.pdf>`_. 
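As a kernel-side illustration of the platforms discussed above (TDX, SEV-SNP and similar CoCo technologies), a guest driver can ask at runtime whether it is executing inside such a confidential VM through the generic confidential-computing attribute query ``cc_platform_has()``. A minimal sketch; the wrapper name is illustrative only::

  #include <linux/cc_platform.h>

  static bool running_in_coco_guest(void)
  {
          /*
           * CC_ATTR_GUEST_MEM_ENCRYPT is set when guest memory is protected
           * from the host, i.e. when running as a TDX or SEV-SNP guest of
           * the kind covered by this threat model.
           */
          return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT);
  }
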
+ +基本的机密计算(CoCo)客户布局包括宿主机ã€å®¢æˆ·æœºã€ç”¨äºŽå®¢æˆ·æœºä¸Žå®¿ä¸»æœºä¹‹é—´é€šä¿¡ +的接å£ã€èƒ½å¤Ÿæ”¯æŒCoCo虚拟机(VM)的平å°ï¼Œä»¥åŠä¸€ä¸ªåœ¨å®¢æˆ·VM和底层平å°ä¹‹é—´å……当安 +全管ç†å‘˜çš„å¯ä¿¡ä¸ä»‹ã€‚宿主机侧的虚拟机监视器(VMMï¼‰é€šå¸¸ç”±ä¼ ç»ŸVMM功能的一个å集 +组æˆï¼Œå¹¶ä»ç„¶è´Ÿè´£å®¢æˆ·æœºç”Ÿå‘½å‘¨æœŸçš„管ç†ï¼Œå³åˆ›å»ºæˆ–销æ¯CoCo虚拟机ã€ç®¡ç†å…¶å¯¹ç³»ç»Ÿèµ„ +æºçš„访问ç‰ã€‚然而,由于它通常ä¸åœ¨CoCo VMçš„å¯ä¿¡è®¡ç®—基(TCB)内,其访问æƒé™å—到 +é™åˆ¶ï¼Œä»¥ç¡®ä¿å®žçŽ°å®‰å…¨ç›®æ ‡ã€‚ + +在下图ä¸ï¼Œ"<--->" 线表示机密计算(CoCo)安全管ç†å‘˜ä¸Žå…¶ä½™ç»„件之间的åŒå‘通信通 +铿ˆ–接å£ï¼Œè¿™äº›ç»„件包括客户机ã€å®¿ä¸»æœºå’Œç¡¬ä»¶ï¼ˆæ•°æ®æµï¼‰:: + + +-------------------+ +-----------------------+ + | CoCo guest VM |<---->| | + +-------------------+ | | + | Interfaces | | CoCo security manager | + +-------------------+ | | + | Host VMM |<---->| | + +-------------------+ | | + | | + +--------------------+ | | + | CoCo platform |<--->| | + +--------------------+ +-----------------------+ + +机密计算(CoCo)安全管ç†å™¨çš„具体细节在在ä¸åŒæŠ€æœ¯ä¹‹é—´å˜åœ¨æ˜¾è‘—å·®å¼‚ã€‚ä¾‹å¦‚ï¼Œåœ¨æŸ +些情况下,它å¯èƒ½é€šè¿‡ç¡¬ä»¶ï¼ˆHW)实现,而在其他情况下,它å¯èƒ½æ˜¯çº¯è½¯ä»¶ï¼ˆSW)实现。 + +现有的Linuxå†…æ ¸å¨èƒæ¨¡åž‹ +======================= + +当å‰Linuxå†…æ ¸å¨èƒæ¨¡åž‹çš„æ€»ä½“组件包括:: + + +-----------------------+ +-------------------+ + | |<---->| Userspace | + | | +-------------------+ + | External attack | | Interfaces | + | vectors | +-------------------+ + | |<---->| Linux Kernel | + | | +-------------------+ + +-----------------------+ +-------------------+ + | Bootloader/BIOS | + +-------------------+ + +-------------------+ + | HW platform | + +-------------------+ + +在å¯åŠ¨è¿‡ç¨‹ä¸ï¼Œå¼•å¯¼åŠ è½½ç¨‹åºï¼ˆbootloaderï¼‰å’Œå†…æ ¸ä¹‹é—´ä¹Ÿå˜åœ¨é€šä¿¡ï¼Œä½†æœ¬å›¾å¹¶æœªæ˜Žç¡® +表示这一点。“接å£â€æ¡†è¡¨ç¤ºå…è®¸å†…æ ¸ä¸Žç”¨æˆ·ç©ºé—´ä¹‹é—´é€šä¿¡çš„å„ç§æŽ¥å£ã€‚ 这包括系统调用〠+å†…æ ¸ APIã€è®¾å¤‡é©±åŠ¨ç¨‹åºç‰ã€‚ + +现有的 Linux å†…æ ¸å¨èƒæ¨¡åž‹é€šå¸¸å‡è®¾å…¶åœ¨ä¸€ä¸ªå—信任的硬件平å°ä¸Šæ‰§è¡Œï¼Œå¹¶ä¸”所有固件 +å’Œå¯åŠ¨åŠ è½½ç¨‹åºéƒ½åŒ…å«åœ¨è¯¥å¹³å°çš„å—信任计算基(TCB)ä¸ã€‚ä¸»è¦æ”»å‡»è€…驻留在用户空间 +ä¸ï¼Œæ¥è‡ªç”¨æˆ·ç©ºé—´çš„æ‰€æœ‰æ•°æ®é€šå¸¸è¢«è®¤ä¸ºæ˜¯ä¸å¯ä¿¡çš„,除éžç”¨æˆ·ç©ºé—´å…·æœ‰è¶³å¤Ÿçš„ç‰¹æƒæ¥ +执行å—信任的æ“作。æ¤å¤–,通常还会考虑外部攻击者,包括那些能够访问å¯ç”¨çš„外部网络 +ï¼ˆä¾‹å¦‚ä»¥å¤ªç½‘ã€æ— 线网络ã€è“ç‰™ï¼‰ã€æš´éœ²çš„硬件接å£ï¼ˆä¾‹å¦‚ USBã€Thunderboltï¼‰ï¼Œä»¥åŠ +能够离线修改ç£ç›˜å†…容的攻击者。 + +关于外部攻击途径,值得注æ„的是,在大多数情况下,外部攻击者会首先å°è¯•利用用户空 +é—´çš„æ¼æ´žï¼Œä½†æ”»å‡»è€…也å¯èƒ½ç›´æŽ¥é’ˆå¯¹å†…æ ¸ï¼Œç‰¹åˆ«æ˜¯åœ¨å®¿ä¸»æœºå…·æœ‰ç‰©ç†è®¿é—®æƒé™çš„æƒ…况下。直 +æŽ¥æ”»å‡»å†…æ ¸çš„ä¾‹ååŒ…æ‹¬æ¼æ´ž CVE-2019-19524ã€CVE-2022-0435 å’Œ CVE-2020-24490。 + +机密计算å¨èƒæ¨¡åž‹åŠå…¶å®‰å…¨ç›®æ ‡ +============================ + +机密计算在上述攻击者列表ä¸å¢žåŠ äº†ä¸€ç§æ–°çš„æ”»å‡»è€…类型:å¯èƒ½å˜åœ¨è¡Œä¸ºä¸å½“的宿主机 +(这å¯èƒ½åŒ…æ‹¬ä¼ ç»Ÿè™šæ‹Ÿæœºç›‘è§†å™¨VMM的部分组件或全部),由于其较大的软件攻击é¢ï¼Œ +通常被置于CoCo VM TCBä¹‹å¤–ã€‚éœ€è¦æ³¨æ„çš„æ˜¯ï¼Œè¿™å¹¶ä¸æ„味ç€å®¿ä¸»æœºæˆ–VMMæ˜¯æ•…æ„æ¶æ„的, +而是强调拥有一个较å°çš„CoCo VM TCBå…·æœ‰å®‰å…¨ä»·å€¼ã€‚è¿™ç§æ–°åž‹çš„æ”»å‡»è€…å¯ä»¥è¢«è§†ä¸ºä¸€ç§ +æ›´å¼ºå¤§çš„å¤–éƒ¨æ”»å‡»è€…ï¼Œå› ä¸ºå®ƒä½äºŽåŒä¸€ç‰©ç†æœºå™¨ä¸Šï¼ˆä¸Žè¿œç¨‹ç½‘络攻击者ä¸åŒï¼‰ï¼Œå¹¶ä¸”对 +å®¢æˆ·æœºå†…æ ¸ä¸Žå¤§éƒ¨åˆ†ç¡¬ä»¶çš„é€šä¿¡å…·æœ‰æŽ§åˆ¶æƒ:: + + +------------------------+ + | CoCo guest VM | + +-----------------------+ | +-------------------+ | + | |<--->| | Userspace | | + | | | +-------------------+ | + | External attack | | | Interfaces | | + | vectors | | +-------------------+ | + | |<--->| | Linux Kernel | | + | | | +-------------------+ | + +-----------------------+ | +-------------------+ | + | | Bootloader/BIOS | | + +-----------------------+ | +-------------------+ | + | |<--->+------------------------+ + | | | Interfaces | + | | +------------------------+ + | CoCo security |<--->| 
Host/Host-side VMM | + | manager | +------------------------+ + | | +------------------------+ + | |<--->| CoCo platform | + +-----------------------+ +------------------------+ + +ä¼ ç»Ÿä¸Šï¼Œå®¿ä¸»æœºå¯¹å®¢æˆ·æœºæ•°æ®æ‹¥æœ‰æ— é™è®¿é—®æƒé™ï¼Œå¹¶å¯ä»¥åˆ©ç”¨è¿™ç§è®¿é—®æƒé™æ¥æ”»å‡»å®¢æˆ·è™š +拟机。然而,机密计算(CoCoï¼‰ç³»ç»Ÿé€šè¿‡æ·»åŠ è¯¸å¦‚å®¢æˆ·æ•°æ®ä¿å¯†æ€§å’Œå®Œæ•´æ€§ä¿æŠ¤ç‰å®‰å…¨ +特性æ¥ç¼“è§£æ¤ç±»æ”»å‡»ã€‚该å¨èƒæ¨¡åž‹å‡è®¾è¿™äº›å®‰å…¨ç‰¹æ€§æ˜¯å¯ç”¨ä¸”完好的。 + +这个 **Linuxå†…æ ¸æœºå¯†è®¡ç®—è™šæ‹Ÿæœºï¼ˆCoCo VMï¼‰çš„å®‰å…¨ç›®æ ‡** å¯ä»¥æ€»ç»“如下: + +1. ä¿æŠ¤CoCoå®¢æˆ·æœºç§æœ‰å†…å˜å’Œå¯„å˜å™¨çš„æœºå¯†æ€§å’Œå®Œæ•´æ€§ã€‚ + +2. 防æ¢å®¿ä¸»æœºç‰¹æƒå‡çº§åˆ°CoCo客户机Linuxå†…æ ¸ã€‚è™½ç„¶å®¿ä¸»æœºï¼ˆåŠä¸»æœºç«¯è™šæ‹Ÿæœºç®¡ç†ç¨‹åºï¼‰ + 确实需è¦ä¸€å®šçš„ç‰¹æƒæ¥åˆ›å»ºã€é”€æ¯æˆ–æš‚åœè®¿å®¢ï¼Œä½†é˜²æ¢ç‰¹æƒå‡çº§çš„éƒ¨åˆ†ç›®æ ‡æ˜¯ç¡®ä¿è¿™äº› + æ“作ä¸ä¼šä¸ºæ”»å‡»è€…æä¾›èŽ·å–å®¢æˆ·æœºå†…æ ¸è®¿é—®æƒé™çš„途径。 + +ä¸Šè¿°å®‰å…¨ç›®æ ‡å¯¼è‡´äº†ä¸¤ä¸ªä¸»è¦çš„**Linuxå†…æ ¸æœºå¯†è®¡ç®—è™šæ‹Ÿæœºï¼ˆCoCo VM)资产**: + +1. å®¢æˆ·æœºå†…æ ¸æ‰§è¡Œä¸Šä¸‹æ–‡ã€‚ +2. å®¢æˆ·æœºå†…æ ¸ç§æœ‰å†…å˜ã€‚ + +宿主机对CoCo客户机资æºå…·æœ‰å®Œå…¨æŽ§åˆ¶æƒï¼Œå¹¶å¯ä»¥éšæ—¶æ‹’ç»è®¿é—®è¿™äº›èµ„æºã€‚资æºçš„示例包 +括CPUæ—¶é—´ã€å®¢æˆ·æœºå¯ä»¥æ¶ˆè€—的内å˜ã€ç½‘络带宽ç‰ã€‚å› æ¤ï¼Œå®¿ä¸»æœºå¯¹CoCoå®¢æˆ·æœºçš„æ‹’ç»æœåŠ¡ +(DoS)攻击超出了æ¤å¨èƒæ¨¡åž‹çš„范围。 + +Linux CoCoè™šæ‹Ÿæœºæ”»å‡»é¢æ˜¯æŒ‡ä»ŽCoCo客户机Linuxå†…æ ¸æš´éœ²åˆ°ä¸å—信任的主机的任何接å£ï¼Œ +è¿™äº›æŽ¥å£æœªè¢«CoCoæŠ€æœ¯çš„è½¯ç¡¬ä»¶ä¿æŠ¤æ‰€è¦†ç›–ã€‚è¿™åŒ…æ‹¬æ‰€æœ‰å¯èƒ½çš„ä¾§ä¿¡é“æ”»å‡»ä»¥åŠçž¬æ€æ‰§ +è¡Œä¾§ä¿¡é“æ”»å‡»ã€‚显å¼ï¼ˆéžæ—é“)接å£çš„示例包括访问端å£I/Oã€å†…å˜æ˜ å°„I/O(MMIO)和 +直接内å˜è®¿é—®ï¼ˆDMA)接å£ã€è®¿é—®PCIé…置空间ã€ç‰¹å®šäºŽè™šæ‹Ÿæœºç®¡ç†ç¨‹åºï¼ˆVMM)的超调用 +(指å‘主机端VMM)ã€è®¿é—®å…±äº«å†…å˜é¡µã€ä¸»æœºå…è®¸æ³¨å…¥åˆ°è®¿å®¢å†…æ ¸çš„ä¸æ–,以åŠç‰¹å®šäºŽ +CoCo技术的超调用(如果å˜åœ¨ï¼‰ã€‚æ¤å¤–,在CoCo系统ä¸ï¼Œå®¿ä¸»æœºé€šå¸¸æŽ§åˆ¶åˆ›å»ºCoCo客户机 +的过程:它有方法将固件和引导程åºé•œåƒã€å†…æ ¸é•œåƒä»¥åŠå†…æ ¸å‘½ä»¤è¡ŒåŠ è½½åˆ°å®¢æˆ·æœºä¸ã€‚所有 +这些数æ®åœ¨é€šè¿‡è¯æ˜Žæœºåˆ¶ç¡®è®¤å…¶å®Œæ•´æ€§å’ŒçœŸå®žæ€§ä¹‹å‰ï¼Œéƒ½åº”视为ä¸å¯ä¿¡çš„。 + +下表显示了针对CoCo客户机Linuxå†…æ ¸çš„å¨èƒçŸ©é˜µï¼Œä½†å¹¶æœªè®¨è®ºæ½œåœ¨çš„缓解ç–略。该矩阵涉 +åŠçš„æ˜¯CoCo特定版本的客户机ã€å®¿ä¸»æœºå’Œå¹³å°ã€‚ + +.. list-table:: CoCo Linuxå®¢æˆ·æœºå†…æ ¸å¨èƒçŸ©é˜µ + :widths: auto + :align: center + :header-rows: 1 + + * - å¨èƒåç§° + - å¨èƒæè¿° + + * - å®¢æˆ·æœºæ¶æ„é…ç½® + - 一个行为ä¸å½“的主机修改了以下其ä¸ä¸€ä¸ªå®¢æˆ·æœºçš„é…置: + + 1. å®¢æˆ·æœºå›ºä»¶æˆ–å¼•å¯¼åŠ è½½ç¨‹åº + + 2. å®¢æˆ·æœºå†…æ ¸æˆ–æ¨¡å—二进制文件 + + 3. 
å®¢æˆ·æœºå‘½ä»¤è¡Œå‚æ•° + + è¿™ä½¿å¾—å®¿ä¸»æœºèƒ½å¤Ÿç ´å在CoCo客户虚拟机内部è¿è¡Œä»£ç 的完整性,从而è¿å了机密计算 + (CoCoï¼‰çš„å®‰å…¨ç›®æ ‡ã€‚ + + * - CoCoå®¢æˆ·æœºæ•°æ®æ”»å‡» + - 一个行为ä¸å½“的宿主机对CoCo客户虚拟机与宿主机管ç†çš„ç‰©ç†æˆ–è™šæ‹Ÿè®¾å¤‡ä¹‹é—´ä¼ è¾“çš„æ•° + æ®æ‹¥æœ‰å®Œå…¨æŽ§åˆ¶æƒã€‚这使得宿主机å¯ä»¥å¯¹è¿™ç±»æ•°æ®çš„ä¿å¯†æ€§ã€å®Œæ•´æ€§å’Œæ–°é²œæ€§è¿›è¡Œä»»ä½•攻击。 + + * - æ ¼å¼é”™è¯¯çš„è¿è¡Œæ—¶è¾“å…¥ + - 一个行为ä¸å½“çš„å®¿ä¸»æœºé€šè¿‡å®¢æˆ·æœºå†…æ ¸ä»£ç 使用的任æ„é€šä¿¡æŽ¥å£æ³¨å…¥æ ¼å¼é”™è¯¯çš„输入。 + å¦‚æžœä»£ç æ²¡æœ‰æ£ç¡®å¤„ç†è¿™äº›è¾“入,这å¯èƒ½å¯¼è‡´ä»Žå®¿ä¸»æœºåˆ°å®¢æˆ·æœºå†…æ ¸çš„ç‰¹æƒæå‡ã€‚这包 + æ‹¬ä¼ ç»Ÿçš„ä¾§ä¿¡é“æ”»å‡»å’Œ/æˆ–çž¬æ€æ‰§è¡Œæ”»å‡»è·¯å¾„。 + + * - æ¶æ„è¿è¡Œæ—¶è¾“å…¥ + - 一个行为ä¸å½“çš„å®¿ä¸»æœºé€šè¿‡å®¢æˆ·æœºå†…æ ¸ä»£ç 使用的任æ„é€šä¿¡æŽ¥å£æ³¨å…¥ç‰¹å®šçš„è¾“å…¥å€¼ã€‚ä¸Žä¹‹å‰ + 的攻击å‘é‡ï¼ˆæ ¼å¼é”™è¯¯çš„è¿è¡Œæ—¶è¾“入)ä¸åŒï¼Œè¿™ä¸ªè¾“å…¥å¹¶éžæ ¼å¼é”™è¯¯ï¼Œè€Œæ˜¯å…¶å€¼è¢«ç²¾å¿ƒè®¾ + 计以影å“å®¢æˆ·æœºå†…æ ¸çš„å®‰å…¨æ€§ã€‚è¿™ç±»è¾“å…¥çš„ä¾‹å包括å‘客户机æä¾›æ¶æ„的时间或å‘客户机 + çš„éšæœºæ•°ç”Ÿæˆå™¨æä¾›ç†µå€¼ã€‚æ¤å¤–ï¼Œå¦‚æžœå®ƒå¯¼è‡´å®¢æˆ·æœºå†…æ ¸æ‰§è¡Œç‰¹å®šæ“作(例如处ç†ä¸»æœºæ³¨ + å…¥çš„ä¸æ–),æ¤ç±»äº‹ä»¶çš„æ—¶åºæœ¬èº«ä¹Ÿå¯èƒ½æˆä¸ºä¸€ç§æ”»å‡»è·¯å¾„ã€‚è¿™ç§æ”»å‡»æ˜¯å¯¹æä¾›çš„宿主机输 + å…¥å…·æœ‰æŠµæŠ—æ€§çš„ä¸€ç§æ–¹å¼ã€‚ diff --git a/Documentation/translations/zh_CN/security/tpm/index.rst b/Documentation/translations/zh_CN/security/tpm/index.rst new file mode 100644 index 000000000000..707646590647 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/index.rst @@ -0,0 +1,20 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/index.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +================ +å¯ä¿¡å¹³å°æ¨¡å—文档 +================ + +.. toctree:: + + tpm_event_log + tpm-security + tpm_tis + tpm_vtpm_proxy + xen-tpmfront + tpm_ftpm_tee diff --git a/Documentation/translations/zh_CN/security/tpm/tpm-security.rst b/Documentation/translations/zh_CN/security/tpm/tpm-security.rst new file mode 100644 index 000000000000..26818d28c98f --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm-security.rst @@ -0,0 +1,151 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm-security.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +TPM安全 +======= + +本文档的目的是æè¿°æˆ‘ä»¬å¦‚ä½•ä½¿å†…æ ¸ä½¿ç”¨TPM在é¢å¯¹å¤–部窥探和数æ®åŒ…篡改 +攻击(文献ä¸ç§°ä¸ºè¢«åŠ¨å’Œä¸»åŠ¨ä¸é—´äººæ”»å‡»ï¼‰æ—¶ä¿æŒåˆç†çš„ç¨³å¥æ€§ã€‚当å‰çš„ +安全文档适用于TPM2.0。 + +ä»‹ç» +---- + +TPM通常是一个通过æŸç§ä½Žå¸¦å®½æ€»çº¿è¿žæŽ¥åˆ°PC的独立芯片。虽然有一些 +例外,例如Intel PTT,它是è¿è¡Œåœ¨é è¿‘CPU的软件环境ä¸çš„软件TPM, +容易å—到ä¸åŒç±»åž‹çš„æ”»å‡»ï¼Œä½†ç›®å‰å¤§å¤šæ•°å¼ºåŒ–çš„å®‰å…¨çŽ¯å¢ƒè¦æ±‚使用独立 +硬件TPM,这是本文讨论的使用场景。 + +总线上的窥探和篡改攻击 +---------------------- + +当å‰çš„æŠ€æœ¯çжæ€å…许使用 `TPM Genie`_ 硬件ä¸é—´äººï¼Œè¿™æ˜¯ä¸€ç§ç®€å•的外部设备,å¯ä»¥åœ¨ +ä»»ä½•ç³»ç»Ÿæˆ–ç¬”è®°æœ¬ç”µè„‘ä¸Šå‡ ç§’é’Ÿå†…å®‰è£…ã€‚æœ€è¿‘æˆåŠŸæ¼”ç¤ºäº†é’ˆå¯¹ `Windows Bitlocker TPM`_ +ç³»ç»Ÿçš„æ”»å‡»ã€‚æœ€è¿‘åŒæ ·çš„æ”»å‡»é’ˆå¯¹ `基于TPMçš„Linuxç£ç›˜åР坆`_ 方案也éåˆ°äº†åŒæ ·çš„æ”»å‡»ã€‚ +ä¸‹ä¸€é˜¶æ®µçš„ç ”ç©¶ä¼¼ä¹Žæ˜¯å…¥ä¾µæ€»çº¿ä¸ŠçŽ°æœ‰çš„è®¾å¤‡ä»¥å……å½“ä¸é—´äººï¼Œå› æ¤æ”»å‡»è€…需è¦ç‰©ç†è®¿é—®å‡ +ç§’é’Ÿçš„è¦æ±‚å¯èƒ½ä¸å†å˜åœ¨ã€‚ç„¶è€Œï¼Œæœ¬æ–‡æ¡£çš„ç›®æ ‡æ˜¯å°½å¯èƒ½åœ¨è¿™ç§çŽ¯å¢ƒä¸‹ä¿æŠ¤TPM的机密性和 +完整性,并å°è¯•ç¡®ä¿å³ä½¿æˆ‘们ä¸èƒ½é˜²æ¢æ”»å‡»ï¼Œè‡³å°‘å¯ä»¥æ£€æµ‹åˆ°å®ƒã€‚ + +ä¸å¹¸çš„æ˜¯ï¼Œå¤§å¤šæ•°TPM功能,包括硬件é‡ç½®åŠŸèƒ½ï¼Œéƒ½èƒ½è¢«èƒ½å¤Ÿè®¿é—®æ€»çº¿çš„æ”»å‡» +è€…æŽ§åˆ¶ï¼Œå› æ¤ä¸‹é¢æˆ‘们将讨论一些å¯èƒ½å‡ºçŽ°çš„å¹²æ‰°æƒ…å†µã€‚ + +测é‡ï¼ˆPCR)完整性 +----------------- + +由于攻击者å¯ä»¥å‘TPMå‘é€è‡ªå·±çš„命令,他们å¯ä»¥å‘é€ä»»æ„çš„PCRæ‰©å±•ï¼Œä»Žè€Œç ´ +åæµ‹é‡ç³»ç»Ÿï¼Œè¿™å°†æ˜¯ä¸€ç§çƒ¦äººçš„æ‹’ç»æœåŠ¡æ”»å‡»ã€‚ç„¶è€Œï¼Œé’ˆå¯¹å¯†å°åˆ°ä¿¡ä»»æµ‹é‡ä¸ +的实体,有两类更严é‡çš„æ”»å‡»ã€‚ + +1. 攻击者å¯ä»¥æ‹¦æˆªæ¥è‡ªç³»ç»Ÿçš„æ‰€æœ‰PCR扩展,并完全替æ¢ä¸ºè‡ªå·±çš„值,产生 + 一个未篡改状æ€çš„é‡çŽ°ï¼Œè¿™ä¼šä½¿PCR测é‡è¯æ˜ŽçŠ¶æ€æ˜¯å¯ä¿¡çš„,并释放密钥。 + +2. 攻击者å¯èƒ½ä¼šåœ¨æŸä¸ªæ—¶åˆ»é‡ç½®TPM,清除PCR,然åŽå‘é€è‡ªå·±çš„æµ‹é‡ï¼Œä»Žè€Œ + 有效地覆盖TPMå·²ç»å®Œæˆçš„å¯åŠ¨æ—¶é—´æµ‹é‡ã€‚ + +ç¬¬ä¸€ç§æ”»å‡»å¯ä»¥é€šè¿‡å§‹ç»ˆå¯¹PCR扩展和读å–命令进行HMACä¿æŠ¤æ¥é˜²æ¢ï¼Œè¿™æ„å‘³ç€ +如果没有在å“应ä¸äº§ç”Ÿå¯æ£€æµ‹çš„HMAC失败,则测é‡å€¼æ— 法被替æ¢ã€‚ç„¶è€Œç¬¬äºŒç§ +攻击åªèƒ½é€šè¿‡ä¾èµ–æŸç§æœºåˆ¶æ¥æ£€æµ‹ï¼Œè¿™ç§æœºåˆ¶ä¼šåœ¨TPMé‡ç½®åŽå‘生å˜åŒ–。 + +ç§˜å¯†ä¿æŠ¤ +-------- + +æŸäº›è¿›å‡ºTPM的信æ¯,如密钥密å°ã€ç§é’¥å¯¼å…¥å’Œéšæœºæ•°ç”Ÿæˆå®¹æ˜“被拦截,而仅仅 +使用HMACä¿æŠ¤æ— æ³•é˜²æ¢è¿™ç§æƒ…å†µã€‚å› æ¤ï¼Œå¯¹äºŽè¿™äº›ç±»åž‹çš„命令,我们必须使用 +请求和å“åº”åŠ å¯†æ¥é˜²æ¢ç§˜å¯†ä¿¡æ¯çš„æ³„露。 + +与TPM建立åˆå§‹ä¿¡ä»» +----------------- + +为了从一开始就æä¾›å®‰å…¨æ€§ï¼Œå¿…须建立一个åˆå§‹çš„共享或éžå¯¹ç§°ç§˜å¯†ï¼Œå¹¶ä¸”该 +秘钥必须对攻击者ä¸å¯çŸ¥ã€‚最明显的途径是使用背书和å˜å‚¨ç§å,这些å¯ä»¥ç”¨ +æ¥æ´¾ç”Ÿéžå¯¹ç§°å¯†é’¥ã€‚ç„¶è€Œï¼Œä½¿ç”¨è¿™äº›å¯†é’¥å¾ˆå›°éš¾ï¼Œå› ä¸ºå°†å®ƒä»¬ä¼ é€’ç»™å†…æ ¸çš„å”¯ +一方法是通过命令行,这需è¦åœ¨å¯åŠ¨ç³»ç»Ÿä¸è¿›è¡Œå¹¿æ³›çš„æ”¯æŒï¼Œè€Œä¸”æ— æ³•ä¿è¯ä»» +何一个层次ä¸ä¼šæœ‰ä»»ä½•å½¢å¼çš„æŽˆæƒã€‚ + +Linuxå†…æ ¸é€‰æ‹©çš„æœºåˆ¶æ˜¯ä»Žç©ºç§åä½¿ç”¨æ ‡å‡†çš„å˜å‚¨ç§å傿•°æ´¾ç”Ÿå‡ºä¸»æ¤åœ†æ›²çº¿ +密钥。空ç§å有两个优势:首先该层次物ç†ä¸Šæ— 法具有授æƒï¼Œå› æ¤æˆ‘ä»¬å§‹ç»ˆå¯ +以使用它;其次空ç§å在TPMé‡ç½®æ—¶ä¼šå‘生å˜åŒ–,这æ„味ç€å¦‚果我们在当天开始 +时基于空ç§å建立信任,如果TPMé‡ç½®ä¸”ç§åå˜åŒ–ï¼Œåˆ™æ‰€æœ‰æ´¾ç”Ÿçš„å¯†é’¥è¿›è¡ŒåŠ ç› +处ç†çš„会è¯éƒ½å°†å¤±è´¥ã€‚ + +显然,在没有任何其他共享秘密的情况下使用空ç§å,我们必须创建并读å–åˆå§‹ +公钥,这当然å¯èƒ½ä¼šè¢«æ€»çº¿ä¸é—´äººæ‹¦æˆªå¹¶æ›¿æ¢ã€‚然而,TPMæœ‰ä¸€ä¸ªå¯†é’¥è®¤è¯æœºåˆ¶ +(使用EK背书è¯ä¹¦ï¼Œåˆ›å»ºè®¤è¯èº«ä»½å¯†é’¥ï¼Œå¹¶ç”¨è¯¥å¯†é’¥è®¤è¯ç©ºç§å主密钥),但由 +äºŽå®ƒè¿‡äºŽå¤æ‚ï¼Œæ— æ³•åœ¨å†…æ ¸ä¸è¿è¡Œï¼Œå› æ¤æˆ‘们ä¿ç•™ç©ºä¸»å¯†é’¥å称的副本,通过 +sysfs导出,以便用户空间在å¯åŠ¨æ—¶è¿›è¡Œå®Œæ•´çš„è®¤è¯ã€‚这里的明确ä¿è¯æ˜¯ï¼Œå¦‚果空 +ä¸»å¯†é’¥è®¤è¯æˆåŠŸï¼Œé‚£ä¹ˆä»Žå½“å¤©å¼€å§‹çš„æ‰€æœ‰TPM交易都是安全的;如果认è¯å¤±è´¥ï¼Œåˆ™ +说明系统上有ä¸é—´äººï¼ˆå¹¶ä¸”任何在å¯åŠ¨æœŸé—´ä½¿ç”¨çš„ç§˜å¯†å¯èƒ½å·²è¢«æ³„露)。 + +ä¿¡ä»»å †å +-------- + +在当å‰çš„空主密钥场景ä¸ï¼ŒTPM必须在交给下一个使用者之å‰å®Œå…¨æ¸…除。然而, +å†…æ ¸å°†æ´¾ç”Ÿå‡ºçš„ç©ºç§å密钥的åç§°ä¼ 
é€’ç»™ç”¨æˆ·ç©ºé—´ï¼Œç”¨æˆ·ç©ºé—´å¯ä»¥é€šè¿‡è®¤è¯æ¥ +验è¯è¯¥åç§°ã€‚å› æ¤ï¼Œè¿™ç§åç§°ä¼ é€’é“¾ä¹Ÿå¯ä»¥ç”¨äºŽå„个å¯åŠ¨ç»„ä»¶ä¹‹é—´ï¼ˆé€šè¿‡æœªæŒ‡ +定的机制)。例如grubå¯ä»¥ä½¿ç”¨ç©ºç§å方案æ¥å®žçŽ°å®‰å…¨æ€§ï¼Œå¹¶å°†åç§°äº¤ç»™å†…æ ¸ã€‚ +å†…æ ¸å¯ä»¥æ´¾ç”Ÿå‡ºå¯†é’¥å’Œå称,并确定如果它们与交接的版本ä¸åŒï¼Œåˆ™è¡¨ç¤ºå‘生 +äº†ç¯¡æ”¹ã€‚å› æ¤å¯ä»¥é€šè¿‡åç§°ä¼ é€’å°†ä»»æ„å¯åŠ¨ç»„ä»¶ï¼ˆä»ŽUEFI到grubåˆ°å†…æ ¸ï¼‰ä¸²è” +èµ·æ¥ï¼Œåªè¦æ¯ä¸ªåŽç»ç»„件知é“如何收集该åç§°,å¹¶æ ¹æ®å…¶æ´¾ç”Ÿçš„密钥进行验è¯ã€‚ + +会è¯å±žæ€§ +-------- + +æ‰€æœ‰å†…æ ¸ä½¿ç”¨çš„TPM命令都å…许会è¯ã€‚HMAC会è¯å¯ç”¨äºŽæ£€æŸ¥è¯·æ±‚å’Œå“应的完整性, +è€Œè§£å¯†å’ŒåŠ å¯†æ ‡å¿—å¯ç”¨äºŽä¿æŠ¤å‚æ•°å’Œå“应。HMACå’ŒåŠ å¯†å¯†é’¥é€šå¸¸æ˜¯ä»Žå…±äº«æŽˆæƒç§˜ +钥推导出æ¥çš„ï¼Œä½†å¯¹äºŽè®¸å¤šå†…æ ¸æ“作æ¥è¯´ï¼Œè¿™äº›å¯†é’¥æ˜¯å·²çŸ¥çš„ï¼ˆé€šå¸¸ä¸ºç©ºï¼‰ã€‚å› +æ¤å†…æ ¸ä½¿ç”¨ç©ºä¸»å¯†é’¥ä½œä¸ºç›å¯†é’¥æ¥åˆ›å»ºæ¯ä¸ªHMAC会è¯ï¼Œè¿™æ ·å°±ä¸ºä¼šè¯å¯†é’¥çš„æ´¾ç”Ÿ +æä¾›äº†åŠ å¯†è¾“å…¥ã€‚å› æ¤å†…æ ¸ä»…éœ€åˆ›å»ºä¸€æ¬¡ç©ºä¸»å¯†é’¥ï¼ˆä½œä¸ºä¸€ä¸ªæ˜“å¤±çš„TPM奿Ÿ„), +并将其ä¿å˜åœ¨tpm_chipä¸ï¼Œç”¨äºŽæ¯æ¬¡åœ¨å†…æ ¸ä¸ä½¿ç”¨TPMæ—¶ã€‚ç”±äºŽå†…æ ¸èµ„æºç®¡ç†å™¨ç¼º +ä¹åŽ»é—´éš™åŒ–ï¼Œå½“å‰æ¯æ¬¡æ“作都需è¦åˆ›å»ºå’Œé”€æ¯ä¼šè¯ï¼Œä½†æœªæ¥å¯èƒ½ä¼šå°†å•个会è¯é‡ç”¨ +ç”¨äºŽå†…æ ¸ä¸çš„HMACã€åŠ å¯†å’Œè§£å¯†ä¼šè¯ã€‚ + +ä¿æŠ¤ç±»åž‹ +-------- + +对于æ¯ä¸ªå†…æ ¸æ“ä½œï¼Œæˆ‘ä»¬ä½¿ç”¨ç©ºä¸»å¯†é’¥åŠ ç›çš„HMACæ¥ä¿æŠ¤å®Œæ•´æ€§ã€‚æ¤å¤–æˆ‘ä»¬ä½¿ç”¨å‚æ•° +åŠ å¯†æ¥ä¿æŠ¤å¯†é’¥å¯†å°ï¼Œå¹¶ä½¿ç”¨å‚数解密æ¥ä¿æŠ¤å¯†é’¥è§£å°å’Œéšæœºæ•°ç”Ÿæˆã€‚ + +空主密钥认è¯åœ¨ç”¨æˆ·ç©ºé—´çš„实现 +============================ + +æ¯ä¸ªTPMéƒ½ä¼šé™„å¸¦å‡ ä¸ªX.509è¯ä¹¦ï¼Œé€šå¸¸ç”¨äºŽä¸»èƒŒä¹¦å¯†é’¥ã€‚本文档å‡è®¾å˜åœ¨æ¤åœ†æ›²çº¿ +版本的è¯ä¹¦ï¼Œä½äºŽ01C00002ï¼Œä½†ä¹ŸåŒæ ·é€‚用于RSAè¯ä¹¦(ä½äºŽ01C00001)。 + +认è¯çš„ç¬¬ä¸€æ¥æ˜¯ä½¿ç”¨ `TCG EK Credential Profile`_ 模æ¿è¿›è¡Œä¸»å¯†é’¥çš„创建,该 +模æ¿å…许将生æˆçš„主密钥与è¯ä¹¦ä¸çš„主密钥进行比较(公钥必须匹é…ï¼‰ã€‚éœ€è¦æ³¨æ„ +的是,生æˆEK主密钥需è¦EK层级密ç ,但EC主密钥的预生æˆç‰ˆæœ¬åº”ä½äºŽ81010002, +并且å¯ä»¥æ— 需密钥授æƒå¯¹å…¶æ‰§è¡ŒTPM2_ReadPublic()æ“作。接下æ¥ï¼Œè¯ä¹¦æœ¬èº«å¿…é¡» +ç»è¿‡éªŒè¯ï¼Œä»¥ç¡®ä¿å…¶å¯ä»¥è¿½æº¯åˆ°åˆ¶é€ å•†æ ¹è¯ä¹¦ï¼ˆè¯¥æ ¹è¯ä¹¦åº”å…¬å¼€åœ¨åˆ¶é€ å•†ç½‘ç«™ä¸Šï¼‰ã€‚ +å®Œæˆæ¤æ¥éª¤åŽå°†åœ¨TPM内部生æˆä¸€ä¸ªè®¤è¯å¯†é’¥ï¼ˆAK),并使用TPM2_MakeCredential〠+AKçš„åç§°å’ŒEKå…¬é’¥åŠ å¯†ä¸€ä¸ªç§˜å¯†ã€‚ç„¶åŽTPM执行TPM2_ActivateCredentialï¼Œåªæœ‰åœ¨ +TPMã€EKå’ŒAK之间的绑定关系æˆç«‹æ—¶ï¼Œæ‰èƒ½æ¢å¤ç§˜å¯†ã€‚现在,生æˆçš„AKå¯ä»¥ç”¨äºŽå¯¹ç”± +å†…æ ¸å¯¼å‡ºçš„ç©ºä¸»å¯†é’¥è¿›è¡Œè®¤è¯ã€‚由于TPM2_MakeCredential/ActivateCredentialæ“作 +ç›¸å¯¹å¤æ‚,下é¢å°†æè¿°ä¸€ç§æ¶‰åŠå¤–部生æˆç§é’¥çš„简化过程。 + +这个过程是通常基于éšç§CA认è¯è¿‡ç¨‹çš„简化缩写。å‡è®¾æ¤æ—¶è®¤è¯ç”±TPM所有者进行, +所有者åªèƒ½è®¿é—®æ‰€æœ‰è€…层次。所有者创建一个外部公/ç§é’¥å¯¹ï¼ˆå‡è®¾æ˜¯æ¤åœ†æ›²çº¿ï¼‰ï¼Œ +并使用内部包装过程将ç§é’¥è¿›è¡Œå°è£…以便导入,该ç§é’¥è¢«å…¶çˆ¶çº§ç”±EC派生的å˜å‚¨ä¸»å¯† +é’¥ä¿æŠ¤ã€‚TPM2_Import()æ“作使用一个以EK主密钥为ç›å€¼çš„傿•°è§£å¯†HMAC会è¯ï¼ˆè¿™ä¹Ÿä¸ +需è¦EK密钥授æƒï¼‰ï¼Œæ„味ç€å†…部å°è£…å¯†é’¥æ˜¯åŠ å¯†å‚æ•°ï¼Œå› æ¤é™¤éžTPM拥有认è¯çš„EKï¼Œå¦ +åˆ™æ— æ³•æ‰§è¡Œå¯¼å…¥æ“作。如果该命令æˆåŠŸæ‰§è¡Œå¹¶ä¸”HMAC在返回时通过验è¯ï¼Œæˆ‘ä»¬å°±çŸ¥é“ +我们有一个åªä¸ºè®¤è¯TPMåŠ è½½çš„ç§é’¥å‰¯æœ¬ã€‚çŽ°åœ¨è¯¥å¯†é’¥å·²åŠ è½½åˆ°TPMä¸ï¼Œå¹¶ä¸”å˜å‚¨ä¸»å¯† +钥已被清除(以释放空间用于生æˆç©ºå¯†é’¥ï¼‰ã€‚ + +çŽ°åœ¨æ ¹æ® `TCG TPM v2.0 Provisioning Guidance`_ ä¸çš„å˜å‚¨é…置生æˆç©ºEC主密钥; +该密钥的å称(å³å…¬é’¥åŒºåŸŸçš„哈希值)被计算出æ¥å¹¶ä¸Žå†…æ ¸åœ¨/sys/class/tpm/tpm0/null_name +ä¸æä¾›çš„ç©ºç§åå称进行比较。如果åç§°ä¸åŒ¹é…,TPMå°±è¢«è®¤ä¸ºæ˜¯å—æŸçš„。如果å称匹é…, +用户执行TPM2_Certify()ï¼Œä½¿ç”¨ç©ºä¸»å¯†é’¥ä½œä¸ºå¯¹è±¡å¥æŸ„ï¼Œä½¿ç”¨åŠ è½½çš„ç§é’¥ä½œä¸ºç¾å奿Ÿ„, +å¹¶æä¾›éšæœºçš„åˆæ ¼æ•°æ®ã€‚返回的certifyInfoçš„ç¾åå°†ä¸ŽåŠ è½½çš„ç§é’¥çš„公钥部分进行验è¯ï¼Œ +å¹¶æ£€æŸ¥åˆæ ¼æ•°æ®ä»¥é˜²æ¢é‡æ”¾ã€‚如果所有测试都通过,用户就å¯ä»¥ç¡®ä¿¡TPM的完整性和éšç§ +æ€§åœ¨æ•´ä¸ªå†…æ ¸å¯åŠ¨è¿‡ç¨‹ä¸å¾—åˆ°äº†ä¿æŠ¤ã€‚ + +.. 
_TPM Genie: https://www.nccgroup.trust/globalassets/about-us/us/documents/tpm-genie.pdf +.. _Windows Bitlocker TPM: https://dolosgroup.io/blog/2021/7/9/from-stolen-laptop-to-inside-the-company-network +.. _基于TPMçš„Linuxç£ç›˜åР坆: https://www.secura.com/blog/tpm-sniffing-attacks-against-non-bitlocker-targets +.. _TCG EK Credential Profile: https://trustedcomputinggroup.org/resource/tcg-ek-credential-profile-for-tpm-family-2-0/ +.. _TCG TPM v2.0 Provisioning Guidance: https://trustedcomputinggroup.org/resource/tcg-tpm-v2-0-provisioning-guidance/ diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst b/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst new file mode 100644 index 000000000000..9c173291ac3e --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_event_log.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_event_log.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +=========== +TPM事件日志 +=========== + +本文档简è¦ä»‹ç»äº†ä»€ä¹ˆæ˜¯TPM日志,以åŠå®ƒæ˜¯å¦‚何从预å¯åŠ¨å›ºä»¶ç§»äº¤åˆ°æ“作系统的。 + +ä»‹ç» +==== + +预å¯åŠ¨å›ºä»¶ç»´æŠ¤ä¸€ä¸ªäº‹ä»¶æ—¥å¿—ï¼Œæ¯å½“它将æŸäº›å†…容哈希到任何一个PCR寄å˜å™¨æ—¶ï¼Œè¯¥ +æ—¥å¿—ä¼šæ·»åŠ æ–°æ¡ç›®ã€‚这些事件按类型分类,并包å«å“ˆå¸ŒåŽçš„PCR寄å˜å™¨å€¼ã€‚通常,预 +å¯åŠ¨å›ºä»¶ä¼šå“ˆå¸Œé‚£äº›å³å°†ç§»äº¤æ‰§è¡Œæƒæˆ–与å¯åŠ¨è¿‡ç¨‹ç›¸å…³çš„ç»„ä»¶ã€‚ + +其主è¦åº”用是远程认è¯ï¼Œè€Œå®ƒä¹‹æ‰€ä»¥æœ‰ç”¨çš„åŽŸå› åœ¨[1]ä¸ç¬¬ä¸€éƒ¨åˆ†å¾ˆå¥½åœ°é˜è¿°äº†ï¼š + +认è¯ç”¨äºŽå‘挑战者æä¾›æœ‰å…³å¹³å°çжæ€çš„ä¿¡æ¯ã€‚然而,PCRçš„å†…å®¹éš¾ä»¥è§£è¯»ï¼›å› æ¤ï¼Œå½“ +PCRå†…å®¹é™„æœ‰æµ‹é‡æ—¥å¿—时,认è¯é€šå¸¸ä¼šæ›´æœ‰ç”¨ã€‚å°½ç®¡æµ‹é‡æ—¥å¿—本身并ä¸å¯ä¿¡ï¼Œä½†å®ƒä»¬ +åŒ…å«æ¯”PCR内容更为丰富的信æ¯é›†ã€‚PCRå†…å®¹ç”¨äºŽå¯¹æµ‹é‡æ—¥å¿—进行验è¯ã€‚ + +UEFI事件日志 +============ + +UEFIæä¾›çš„事件日志有一些比较奇怪的特性。 + +在调用ExitBootServices()之å‰ï¼ŒLinux EFIå¼•å¯¼åŠ è½½ç¨‹åºä¼šå°†äº‹ä»¶æ—¥å¿—å¤åˆ¶åˆ°ç”± +å¼•å¯¼åŠ è½½ç¨‹åºè‡ªå®šä¹‰çš„é…置表ä¸ã€‚ä¸å¹¸çš„æ˜¯ï¼Œé€šè¿‡ExitBootServices()生æˆçš„事件 +å¹¶ä¸ä¼šå‡ºçŽ°åœ¨è¿™ä¸ªè¡¨é‡Œã€‚ + +固件æä¾›äº†ä¸€ä¸ªæ‰€è°“的最终事件é…ç½®è¡¨æŽ’åºæ¥è§£å†³è¿™ä¸ªé—®é¢˜ã€‚事件会在第一次调用 +EFI_TCG2_PROTOCOL.GetEventLog()åŽè¢«é•œåƒåˆ°è¿™ä¸ªè¡¨ä¸ã€‚ + +这引出了å¦ä¸€ä¸ªé—®é¢˜ï¼šæ— 法ä¿è¯å®ƒä¸ä¼šåœ¨ Linux EFI stub 开始è¿è¡Œä¹‹å‰è¢«è°ƒç”¨ã€‚ +å› æ¤ï¼Œåœ¨ stub è¿è¡Œæ—¶ï¼Œå®ƒéœ€è¦è®¡ç®—并将最终事件表的大å°ä¿å˜åˆ°è‡ªå®šä¹‰é…置表ä¸ï¼Œ +以便TPM驱动程åºå¯ä»¥åœ¨ç¨åŽè¿žæŽ¥æ¥è‡ªè‡ªå®šä¹‰é…置表和最终事件表的两个事件日志时 +跳过这些事件。 + +å‚考文献 +======== + +- [1] https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/ +- [2] The final concatenation is done in drivers/char/tpm/eventlog/efi.c diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst b/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst new file mode 100644 index 000000000000..5901eee32563 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_ftpm_tee.rst @@ -0,0 +1,31 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_ftpm_tee.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +=========== +固件TPM驱动 +=========== + +本文档æè¿°äº†å›ºä»¶å¯ä¿¡å¹³å°æ¨¡å—(fTPM)设备驱动。 + +ä»‹ç» +==== + +è¯¥é©±åŠ¨ç¨‹åºæ˜¯ç”¨äºŽARMçš„TrustZone环境ä¸å®žçŽ°çš„å›ºä»¶çš„é€‚é…器。该驱动 +程åºå…许程åºä»¥ä¸Žç¡¬ä»¶TPM相åŒçš„æ–¹å¼ä¸ŽTPM进行交互。 + +设计 +==== + +该驱动程åºå……å½“ä¸€ä¸ªè–„å±‚ï¼Œä¼ é€’å‘½ä»¤åˆ°å›ºä»¶å®žçŽ°çš„TPM并接收其å“应。驱动 +ç¨‹åºæœ¬èº«å¹¶ä¸åŒ…å«å¤ªå¤šé€»è¾‘ï¼Œæ›´åƒæ˜¯å›ºä»¶ä¸Žå†…æ ¸/用户空间之间的一个管é“。 + +固件本身基于以下论文: +https://www.microsoft.com/en-us/research/wp-content/uploads/2017/06/ftpm1.pdf + +当驱动程åºè¢«åŠ è½½æ—¶ï¼Œå®ƒä¼šå‘用户空间暴露 ``/dev/tpmX`` å—符设备,å…许 +用户空间通过该设备与固件TPM进行通信。 diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst b/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst new file mode 100644 index 000000000000..0fb009f93e10 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_tis.rst @@ -0,0 +1,43 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_tis.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +TPM FIFO接å£é©±åЍ +================ + +TCG PTPè§„èŒƒå®šä¹‰äº†ä¸¤ç§æŽ¥å£ç±»åž‹ï¼šFIFOå’ŒCRB。å‰è€…基于顺åºçš„读写æ“作, +åŽè€…基于包å«å®Œæ•´å‘½ä»¤æˆ–å“应的缓冲区。 + +FIFO(先进先出)接å£è¢«tpm_tis_coreä¾èµ–的驱动程åºä½¿ç”¨ã€‚最åˆï¼ŒLinuxåª +有一个å为tpm_tisçš„é©±åŠ¨ï¼Œè¦†ç›–äº†å†…å˜æ˜ å°„ï¼ˆå³ MMIO)接å£ï¼Œä½†åŽæ¥å®ƒè¢« +扩展以支æŒTCGæ ‡å‡†æ‰€æ”¯æŒçš„å…¶ä»–ç‰©ç†æŽ¥å£ã€‚ + +由于历å²åŽŸå› ï¼Œæœ€åˆçš„MMIO驱动被称为tpm_tis,而FIFO驱动的框架被命å为 +tpm_tis_core。在tpm_tisä¸çš„“tisâ€åŽç¼€æ¥è‡ªTPM接å£è§„范,这是针对TPM1.x +芯片的硬件接å£è§„范。 + +通信基于一个由TPMèŠ¯ç‰‡é€šè¿‡ç¡¬ä»¶æ€»çº¿æˆ–å†…å˜æ˜ 射共享的20KiB 缓冲区,具体 +å–å†³äºŽç‰©ç†æŽ¥çº¿ã€‚è¯¥ç¼“å†²åŒºè¿›ä¸€æ¥åˆ†ä¸ºäº”个相ç‰å¤§å°çš„4KiB缓冲区,为CPUå’Œ +TPM之间的通信æä¾›ç‰æ•ˆçš„寄å˜å™¨é›†ã€‚这些通信端点在TCG术è¯ä¸ç§°ä¸ºlocalities。 + +å½“å†…æ ¸æƒ³è¦å‘TPM芯片å‘é€å‘½ä»¤æ—¶ï¼Œå®ƒé¦–先通过在TPM_ACCESS寄å˜å™¨ä¸è®¾ç½® +requestUse使¥ä¿ç•™locality0。当访问被授予时,该ä½ç”±èŠ¯ç‰‡æ¸…é™¤ã€‚ä¸€æ—¦å®Œæˆ +é€šä¿¡ï¼Œå†…æ ¸ä¼šå†™å…¥TPM_ACCESS.activeLocalityä½ã€‚这告诉芯片该本地性已被 +释放。 + +待处ç†çš„æœ¬åœ°æ€§ç”±èŠ¯ç‰‡æŒ‰ä¼˜å…ˆçº§é™åºé€ä¸ªæœåŠ¡ï¼Œä¸€æ¬¡ä¸€ä¸ªï¼š + +- Locality0优先级最低。 +- Locality5优先级最高。 + +关于localities的更多信æ¯å’Œå«ä¹‰ï¼Œè¯·å‚阅TCG PC客户端平å°TPM é…置文件规范的第3.2节。 + +å‚考文献 +======== + +TCG PC客户端平å°TPMé…置文件(PTP)规范 +https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/ diff --git a/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst b/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst new file mode 100644 index 000000000000..bc92cfb684c3 --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/tpm_vtpm_proxy.rst @@ -0,0 +1,51 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. 
include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/tpm_vtpm_proxy.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +========================== +Linux容器的虚拟TPM代ç†é©±åЍ +========================== + +| 作者: +| Stefan Berger <stefanb@linux.vnet.ibm.com> + +本文档æè¿°äº†ç”¨äºŽLinux容器的虚拟å¯ä¿¡å¹³å°æ¨¡å—(vTPM)代ç†è®¾å¤‡é©±åŠ¨ã€‚ + +ä»‹ç» +==== + +è¿™é¡¹å·¥ä½œçš„ç›®æ ‡æ˜¯ä¸ºæ¯ä¸ªLinux容器æä¾›TPM功能。这使得程åºèƒ½å¤Ÿåƒä¸Žç‰©ç†ç³»ç»Ÿ +上的TPMäº¤äº’ä¸€æ ·ï¼Œä¸Žå®¹å™¨ä¸çš„TPM进行交互。æ¯ä¸ªå®¹å™¨éƒ½ä¼šèŽ·å¾—ä¸€ä¸ªå”¯ä¸€çš„ã€æ¨¡ +拟的软件TPM。 + +设计 +==== + +为了使æ¯ä¸ªå®¹å™¨éƒ½èƒ½ä½¿ç”¨æ¨¡æ‹Ÿçš„软件TPMï¼Œå®¹å™¨ç®¡ç†æ ˆéœ€è¦åˆ›å»ºä¸€å¯¹è®¾å¤‡ï¼Œå…¶ä¸ +包括一个客户端TPMå—符设备 ``/dev/tpmX`` (X=0,1,2...)和一个‘æœåŠ¡å™¨ç«¯â€™ +文件æè¿°ç¬¦ã€‚当文件æè¿°ç¬¦ä¼ 被递给TPM模拟器时,å‰è€…通过创建具有适当主次 +设备å·çš„å—符设备被移入容器,然åŽï¼Œå®¹å™¨å†…的软件å¯ä»¥ä½¿ç”¨å—符设备å‘é€TPM +命令,模拟器将通过文件æè¿°ç¬¦æŽ¥æ”¶è¿™äº›å‘½ä»¤ï¼Œå¹¶ç”¨å®ƒæ¥å‘é€å“应。 + +为了支æŒè¿™ä¸€ç‚¹ï¼Œè™šæ‹ŸTPM代ç†é©±åŠ¨ç¨‹åºæä¾›äº†ä¸€ä¸ªè®¾å¤‡ ``/dev/vtpmx`` ,该设备 +用于通过ioctl创建设备对。ioctl将其作为é…ç½®è®¾å¤‡çš„è¾“å…¥æ ‡å¿—ï¼Œä¾‹å¦‚è¿™äº›æ ‡å¿—æŒ‡ç¤º +TPMæ¨¡æ‹Ÿå™¨æ˜¯å¦æ”¯æŒTPM1.2或TPM2功能。ioctl的结果是返回‘æœåŠ¡å™¨ç«¯â€™çš„æ–‡ä»¶æè¿°ç¬¦ +以åŠåˆ›å»ºçš„å—符设备的主次设备å·ã€‚æ¤å¤–,还会返回TPMå—符设备的编å·ã€‚例如,如果 +创建了 ``/dev/tpm10`` ,则返回编å·ï¼ˆ ``dev_num`` )10。 + +一旦设备被创建,驱动程åºå°†ç«‹å³å°è¯•与TPM进行通信。æ¥è‡ªé©±åŠ¨ç¨‹åºçš„æ‰€æœ‰å‘½ä»¤ +都å¯ä»¥ä»Žioctl返回的文件æè¿°ç¬¦ä¸è¯»å–。这些命令应该立å³å¾—到å“应。 + +UAPI +==== + +该APIåœ¨ä»¥ä¸‹å†…æ ¸ä»£ç ä¸ï¼š + +include/uapi/linux/vtpm_proxy.h +drivers/char/tpm/tpm_vtpm_proxy.c + +函数:vtpmx_ioc_new_dev diff --git a/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst b/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst new file mode 100644 index 000000000000..fa085d98a99b --- /dev/null +++ b/Documentation/translations/zh_CN/security/tpm/xen-tpmfront.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../../disclaimer-zh_CN.rst + +:Original: Documentation/security/tpm/xen-tpmfront.rst + +:翻译: + 赵硕 Shuo Zhao <zhaoshuo@cqsoftware.com.cn> + +================ +Xen的虚拟TPMæŽ¥å£ +================ + +作者:Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA) + +本文档æè¿°äº†ç”¨äºŽXen的虚拟å¯ä¿¡å¹³å°æ¨¡å—(vTPM)å系统。å‡å®šè¯»è€…熟悉 +Xenå’ŒLinux的构建和安装,并对TPMå’ŒvTPM概念有基本的ç†è§£ã€‚ + +ä»‹ç» +---- + +è¿™é¡¹å·¥ä½œçš„ç›®æ ‡æ˜¯ä¸ºè™šæ‹Ÿå®¢æˆ·æ“作系统(在Xenä¸ç§°ä¸ºDomU)æä¾›TPM功能。这使得 +程åºèƒ½å¤Ÿåƒä¸Žç‰©ç†ç³»ç»Ÿä¸Šçš„TPMäº¤äº’ä¸€æ ·ï¼Œä¸Žè™šæ‹Ÿç³»ç»Ÿä¸çš„TPM进行交互。æ¯ä¸ªå®¢æˆ· +æ“ä½œç³»ç»Ÿéƒ½ä¼šèŽ·å¾—ä¸€ä¸ªå”¯ä¸€çš„ã€æ¨¡æ‹Ÿçš„软件TPM。然而,vTPM的所有秘密(如密钥〠+NVRAM ç‰ï¼‰ç”±vTPM管ç†åŸŸè¿›è¡Œç®¡ç†ï¼Œè¯¥åŸŸå°†è¿™äº›ç§˜å¯†å°å˜åˆ°ç‰©ç†TPMä¸ã€‚如果创建这 +些域(管ç†åŸŸã€vTPM域和客户域)的过程是å¯ä¿¡çš„,vTPMåç³»ç»Ÿå°±èƒ½å°†æ ¹æ¤äºŽç¡¬ä»¶ +TPM的信任链扩展到Xenä¸çš„虚拟机。vTPMçš„æ¯ä¸ªä¸»è¦ç»„件都作为一个独立的域实现, +从而通过虚拟机监控程åºï¼ˆhypervisor)æä¾›å®‰å…¨éš”离。 + +这个mini-os vTPM å系统是建立在IBMå’ŒIntelå…¬å¸ä¹‹å‰çš„vTPM工作基础上的。 + + +设计概述 +-------- + +vTPM的架构æè¿°å¦‚下:: + + +------------------+ + | Linux DomU | ... + | | ^ | + | v | | + | xen-tpmfront | + +------------------+ + | ^ + v | + +------------------+ + | mini-os/tpmback | + | | ^ | + | v | | + | vtpm-stubdom | ... 
+ | | ^ | + | v | | + | mini-os/tpmfront | + +------------------+ + | ^ + v | + +------------------+ + | mini-os/tpmback | + | | ^ | + | v | | + | vtpmmgr-stubdom | + | | ^ | + | v | | + | mini-os/tpm_tis | + +------------------+ + | ^ + v | + +------------------+ + | Hardware TPM | + +------------------+ + +* Linux DomU: + 希望使用vTPM的基于Linux的客户机。å¯èƒ½æœ‰å¤šä¸ªè¿™æ ·çš„实例。 + +* xen-tpmfront.ko: + Linuxå†…æ ¸è™šæ‹ŸTPMå‰ç«¯é©±åŠ¨ç¨‹åºã€‚该驱动程åºä¸ºåŸºäºŽLinuxçš„DomUæä¾› + vTPM访问。 + +* mini-os/tpmback: + Mini-os TPMåŽç«¯é©±åŠ¨ç¨‹åºã€‚Linuxå‰ç«¯é©±åŠ¨ç¨‹åºé€šè¿‡è¯¥åŽç«¯é©±åŠ¨ç¨‹åºè¿ž + 接,以便在Linux DomU和其vTPM之间进行通信。该驱动程åºè¿˜è¢« + vtpmmgr-stubdom用于与vtpm-stubdom通信。 + +* vtpm-stubdom: + 一个实现vTPMçš„mini-oså˜æ ¹åŸŸã€‚æ¯ä¸ªæ£åœ¨è¿è¡Œçš„vtpm-stubdom实例与系统 + 上的逻辑vTPM之间有一一对应的关系。vTPMå¹³å°é…置寄å˜å™¨ï¼ˆPCRs)通常都 + åˆå§‹åŒ–为零。 + +* mini-os/tpmfront: + Mini-os TPMå‰ç«¯é©±åŠ¨ç¨‹åºã€‚vTPM mini-os域vtpm-stubdomä½¿ç”¨è¯¥é©±åŠ¨ç¨‹åº + 与vtpmmgr-stubdom通信。æ¤é©±åŠ¨ç¨‹åºè¿˜ç”¨äºŽä¸ŽvTPM域通信的mini-os域,例 + 如 pv-grub。 + +* vtpmmgr-stubdom: + 一个实现vTPM管ç†å™¨çš„mini-os域。系统ä¸åªæœ‰ä¸€ä¸ªvTPM管ç†å™¨ï¼Œå¹¶ä¸”在整个 + 机器生命周期内应一直è¿è¡Œã€‚æ¤åŸŸè°ƒèŠ‚å¯¹ç³»ç»Ÿä¸ç‰©ç†TPMçš„è®¿é—®ï¼Œå¹¶ç¡®ä¿æ¯ä¸ª + vTPMçš„æŒä¹…状æ€ã€‚ + +* mini-os/tpm_tis: + Mini-osTPM1.2版本TPM 接å£è§„范(TIS)驱动程åºã€‚该驱动程åºç”±vtpmmgr-stubdom + 用于直接与硬件TPM通信。通信通过将硬件内å˜é¡µæ˜ 射到vtpmmgr-stubdomæ¥å®žçŽ°ã€‚ + +* 硬件TPM: + 固定在主æ¿ä¸Šçš„ç‰©ç† TPM。 + +与Xençš„é›†æˆ +----------- + +vTPM驱动程åºçš„æ”¯æŒå·²åœ¨Xen4.3ä¸é€šè¿‡libxlå·¥å…·å †æ ˆæ·»åŠ ã€‚æœ‰å…³è®¾ç½®vTPMå’ŒvTPM +管ç†å™¨å˜æ ¹åŸŸçš„详细信æ¯ï¼Œè¯·å‚è§Xen文档(docs/misc/vtpm.txtï¼‰ã€‚ä¸€æ—¦å˜æ ¹åŸŸ +è¿è¡Œï¼Œä¸Žç£ç›˜æˆ–网络设备相åŒï¼ŒvTPM设备将在域的é…置文件ä¸è¿›è¡Œè®¾ç½® + +为了使用诸如IMAï¼ˆå®Œæ•´æ€§æµ‹é‡æž¶æž„)ç‰éœ€è¦åœ¨initrd之å‰åŠ è½½TPM的功能,必须将 +xen-tpmfront驱动程åºç¼–è¯‘åˆ°å†…æ ¸ä¸ã€‚如果ä¸ä½¿ç”¨è¿™äº›åŠŸèƒ½ï¼Œé©±åŠ¨ç¨‹åºå¯ä»¥ä½œä¸º +模å—编译,并åƒå¾€å¸¸ä¸€æ ·åŠ è½½ã€‚ diff --git a/Documentation/translations/zh_TW/admin-guide/README.rst b/Documentation/translations/zh_TW/admin-guide/README.rst index a6e34c200ea3..0b038074d9d1 100644 --- a/Documentation/translations/zh_TW/admin-guide/README.rst +++ b/Documentation/translations/zh_TW/admin-guide/README.rst @@ -149,7 +149,7 @@ Linuxå…§æ ¸6.x版本 <http://kernel.org/> "make xconfig" 基於Qtçš„é…置工具。 - "make gconfig" 基於GTK+çš„é…置工具。 + "make gconfig" 基於GTKçš„é…置工具。 "make oldconfig" åŸºæ–¼ç¾æœ‰çš„ ./.config æ–‡ä»¶é¸æ“‡æ‰€æœ‰é¸é …ï¼Œä¸¦è©¢å• æ–°é…ç½®é¸é …。 diff --git a/Documentation/translations/zh_TW/process/submit-checklist.rst b/Documentation/translations/zh_TW/process/submit-checklist.rst index 0ecb187753e4..a0cb91a6945f 100644 --- a/Documentation/translations/zh_TW/process/submit-checklist.rst +++ b/Documentation/translations/zh_TW/process/submit-checklist.rst @@ -85,8 +85,8 @@ Linuxå…§æ ¸è£œä¸æäº¤æª¢æŸ¥å–® 17) æ‰€æœ‰æ–°çš„æ¨¡å¡Šåƒæ•¸éƒ½è¨˜éŒ„在 ``MODULE_PARM_DESC()`` 18) 所有新的用戶空間接å£éƒ½è¨˜éŒ„在 ``Documentation/ABI/`` ä¸ã€‚有關詳細信æ¯ï¼Œ - è«‹åƒé–± ``Documentation/ABI/README`` 。更改用戶空間接å£çš„è£œä¸æ‡‰è©²æŠ„é€ - linux-api@vger.kernel.org。 + è«‹åƒé–± Documentation/admin-guide/abi.rst (或 ``Documentation/ABI/README``)。 + 更改用戶空間接å£çš„è£œä¸æ‡‰è©²æŠ„é€ linux-api@vger.kernel.org\ 。 19) 已通éŽè‡³å°‘注入slabå’Œpage分é…失敗進行檢查。請åƒé–± ``Documentation/fault-injection/`` 。 å¦‚æžœæ–°ä»£ç¢¼æ˜¯å¯¦è³ªæ€§çš„ï¼Œé‚£éº¼æ·»åŠ å系統特定的故障注入å¯èƒ½æ˜¯åˆé©çš„。 diff --git a/Documentation/usb/gadget-testing.rst b/Documentation/usb/gadget-testing.rst index bf555c2270f5..1998dc146c56 100644 --- a/Documentation/usb/gadget-testing.rst +++ b/Documentation/usb/gadget-testing.rst @@ -1050,7 +1050,7 @@ Its attributes are: midi1_num_groups The number of groups for MIDI 1.0 (0-16) ui_hint UI-hint 
of this FB 0: unknown, 1: receiver, 2: sender, 3: both - midi_ci_verison Supported MIDI-CI version number (8 bit) + midi_ci_version Supported MIDI-CI version number (8 bit) is_midi1 Legacy MIDI 1.0 device (0-2) 0: MIDI 2.0 device, 1: MIDI 1.0 without restriction, or diff --git a/Documentation/userspace-api/accelerators/ocxl.rst b/Documentation/userspace-api/accelerators/ocxl.rst index db7570d5e50d..4e213af70237 100644 --- a/Documentation/userspace-api/accelerators/ocxl.rst +++ b/Documentation/userspace-api/accelerators/ocxl.rst @@ -3,8 +3,11 @@ OpenCAPI (Open Coherent Accelerator Processor Interface) ======================================================== OpenCAPI is an interface between processors and accelerators. It aims -at being low-latency and high-bandwidth. The specification is -developed by the `OpenCAPI Consortium <http://opencapi.org/>`_. +at being low-latency and high-bandwidth. + +The specification was developed by the OpenCAPI Consortium, and is now +available from the `Compute Express Link Consortium +<https://computeexpresslink.org/resource/opencapi-specification-archive/>`_. It allows an accelerator (which could be an FPGA, ASICs, ...) to access the host memory coherently, using virtual addresses. An OpenCAPI diff --git a/Documentation/userspace-api/dma-buf-heaps.rst b/Documentation/userspace-api/dma-buf-heaps.rst new file mode 100644 index 000000000000..535f49047ce6 --- /dev/null +++ b/Documentation/userspace-api/dma-buf-heaps.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +Allocating dma-buf using heaps +============================== + +Dma-buf Heaps are a way for userspace to allocate dma-buf objects. They are +typically used to allocate buffers from a specific allocation pool, or to share +buffers across frameworks. + +Heaps +===== + +A heap represents a specific allocator. The Linux kernel currently supports the +following heaps: + + - The ``system`` heap allocates virtually contiguous, cacheable, buffers. + + - The ``cma`` heap allocates physically contiguous, cacheable, + buffers. Only present if a CMA region is present. Such a region is + usually created either through the kernel commandline through the + `cma` parameter, a memory region Device-Tree node with the + `linux,cma-default` property set, or through the `CMA_SIZE_MBYTES` or + `CMA_SIZE_PERCENTAGE` Kconfig options. Depending on the platform, it + might be called ``reserved``, ``linux,cma``, or ``default-pool``. 
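Allocation from one of these heaps is done with the ``DMA_HEAP_IOCTL_ALLOC`` ioctl on the heap's character device, which hands back a dma-buf file descriptor. A minimal userspace sketch against the ``system`` heap, assuming the usual ``/dev/dma_heap/<name>`` device nodes (buffer size and error handling are illustrative only)::

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <unistd.h>
  #include <linux/dma-heap.h>

  int main(void)
  {
          struct dma_heap_allocation_data alloc = {
                  .len      = 4096,               /* requested buffer size */
                  .fd_flags = O_RDWR | O_CLOEXEC, /* flags for the new dma-buf fd */
          };
          int heap_fd = open("/dev/dma_heap/system", O_RDONLY | O_CLOEXEC);

          if (heap_fd < 0 || ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc) < 0) {
                  perror("dma-heap allocation");
                  return 1;
          }

          /* alloc.fd is a dma-buf: mmap() it here or pass it to a driver. */
          void *p = mmap(NULL, alloc.len, PROT_READ | PROT_WRITE, MAP_SHARED,
                         alloc.fd, 0);
          if (p != MAP_FAILED)
                  munmap(p, alloc.len);

          close(alloc.fd);
          close(heap_fd);
          return 0;
  }

The returned descriptor behaves like any other dma-buf: it can be mapped, passed over a UNIX socket, or attached to a device through the normal dma-buf import paths.
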
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index b1395d94b3fd..9cbe4390c872 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -44,6 +44,7 @@ Devices and I/O :maxdepth: 1 accelerators/ocxl + dma-buf-heaps dma-buf-alloc-exchange gpio/index iommufd diff --git a/Documentation/userspace-api/media/rc/rc-sysfs-nodes.rst b/Documentation/userspace-api/media/rc/rc-sysfs-nodes.rst index 34d6a0a1f4d3..70b5966aaff8 100644 --- a/Documentation/userspace-api/media/rc/rc-sysfs-nodes.rst +++ b/Documentation/userspace-api/media/rc/rc-sysfs-nodes.rst @@ -6,7 +6,7 @@ Remote Controller's sysfs nodes ******************************* -As defined at ``Documentation/ABI/testing/sysfs-class-rc``, those are +As defined at Documentation/ABI/testing/sysfs-class-rc, those are the sysfs nodes that control the Remote Controllers: diff --git a/MAINTAINERS b/MAINTAINERS index c9763412a508..c32759cdc569 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2213,6 +2213,7 @@ ARM/APPLE MACHINE SUPPORT M: Sven Peter <sven@svenpeter.dev> M: Janne Grunau <j@jannau.net> R: Alyssa Rosenzweig <alyssa@rosenzweig.io> +R: Neal Gompa <neal@gompa.dev> L: asahi@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -2237,6 +2238,7 @@ F: Documentation/devicetree/bindings/pci/apple,pcie.yaml F: Documentation/devicetree/bindings/pinctrl/apple,pinctrl.yaml F: Documentation/devicetree/bindings/power/apple* F: Documentation/devicetree/bindings/pwm/apple,s5l-fpwm.yaml +F: Documentation/devicetree/bindings/spi/apple,spi.yaml F: Documentation/devicetree/bindings/watchdog/apple,wdt.yaml F: arch/arm64/boot/dts/apple/ F: drivers/bluetooth/hci_bcm4377.c @@ -2254,6 +2256,7 @@ F: drivers/nvmem/apple-efuses.c F: drivers/pinctrl/pinctrl-apple-gpio.c F: drivers/pwm/pwm-apple.c F: drivers/soc/apple/* +F: drivers/spi/spi-apple.c F: drivers/watchdog/apple_wdt.c F: include/dt-bindings/interrupt-controller/apple-aic.h F: include/dt-bindings/pinctrl/apple.h @@ -4011,10 +4014,10 @@ F: include/vdso/bits.h F: lib/bitmap-str.c F: lib/bitmap.c F: lib/cpumask.c -F: lib/cpumask_kunit.c F: lib/find_bit.c F: lib/find_bit_benchmark.c F: lib/test_bitmap.c +F: lib/tests/cpumask_kunit.c F: tools/include/linux/bitfield.h F: tools/include/linux/bitmap.h F: tools/include/linux/bits.h @@ -4023,6 +4026,11 @@ F: tools/include/vdso/bits.h F: tools/lib/bitmap.c F: tools/lib/find_bit.c +BITMAP API BINDINGS [RUST] +M: Yury Norov <yury.norov@gmail.com> +S: Maintained +F: rust/helpers/cpumask.c + BITOPS API M: Yury Norov <yury.norov@gmail.com> R: Rasmus Villemoes <linux@rasmusvillemoes.dk> @@ -5399,6 +5407,8 @@ F: Documentation/dev-tools/checkpatch.rst CHINESE DOCUMENTATION M: Alex Shi <alexs@kernel.org> M: Yanteng Si <siyanteng@loongson.cn> +R: Dongliang Mu <dzm91@hust.edu.cn> +T: git git://git.kernel.org/pub/scm/linux/kernel/git/alexs/linux.git S: Maintained F: Documentation/translations/zh_CN/ @@ -6915,6 +6925,7 @@ L: dri-devel@lists.freedesktop.org L: linaro-mm-sig@lists.linaro.org (moderated for non-subscribers) S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git +F: Documentation/userspace-api/dma-buf-heaps.rst F: drivers/dma-buf/dma-heap.c F: drivers/dma-buf/heaps/* F: include/linux/dma-heap.h @@ -8225,6 +8236,7 @@ F: drivers/edac/aspeed_edac.c EDAC-BLUEFIELD M: Shravan Kumar Ramani <shravankr@nvidia.com> +M: David Thompson <davthompson@nvidia.com> S: Supported F: drivers/edac/bluefield_edac.c @@ -8644,7 +8656,6 @@ F: 
rust/kernel/net/phy/reg.rs EXEC & BINFMT API, ELF M: Kees Cook <kees@kernel.org> -R: Eric Biederman <ebiederm@xmission.com> L: linux-mm@kvack.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve @@ -8901,6 +8912,9 @@ F: include/linux/fs.h F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h +F: Documentation/driver-api/early-userspace/buffer-format.rst +F: init/do_mounts* +F: init/*initramfs* FILESYSTEMS [EXPORTFS] M: Chuck Lever <chuck.lever@oracle.com> @@ -9070,9 +9084,9 @@ L: linux-hardening@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: include/linux/fortify-string.h -F: lib/fortify_kunit.c -F: lib/memcpy_kunit.c F: lib/test_fortify/* +F: lib/tests/fortify_kunit.c +F: lib/tests/memcpy_kunit.c K: \bunsafe_memcpy\b K: \b__NO_FORTIFY\b @@ -9757,9 +9771,9 @@ F: include/linux/string.h F: include/linux/string_choices.h F: include/linux/string_helpers.h F: lib/string.c -F: lib/string_kunit.c F: lib/string_helpers.c -F: lib/string_helpers_kunit.c +F: lib/tests/string_helpers_kunit.c +F: lib/tests/string_kunit.c F: scripts/coccinelle/api/string_choices.cocci GENERIC UIO DRIVER FOR PCI DEVICES @@ -9779,6 +9793,7 @@ F: include/asm-generic/vdso/vsyscall.h F: include/vdso/ F: kernel/time/vsyscall.c F: lib/vdso/ +F: tools/testing/selftests/vDSO/ GENWQE (IBM Generic Workqueue Card) M: Frank Haverkamp <haver@linux.ibm.com> @@ -12586,8 +12601,9 @@ F: Documentation/ABI/testing/sysfs-kernel-warn_count F: arch/*/configs/hardening.config F: include/linux/overflow.h F: include/linux/randomize_kstack.h +F: include/linux/ucopysize.h F: kernel/configs/hardening.config -F: lib/usercopy_kunit.c +F: lib/tests/usercopy_kunit.c F: mm/usercopy.c F: security/Kconfig.hardening K: \b(add|choose)_random_kstack_offset\b @@ -12824,9 +12840,7 @@ F: fs/kernfs/ F: include/linux/kernfs.h KEXEC -M: Eric Biederman <ebiederm@xmission.com> L: kexec@lists.infradead.org -S: Maintained W: http://kernel.org/pub/linux/utils/kernel/kexec/ F: include/linux/kexec.h F: include/uapi/linux/kexec.h @@ -12987,7 +13001,7 @@ F: Documentation/trace/kprobes.rst F: include/asm-generic/kprobes.h F: include/linux/kprobes.h F: kernel/kprobes.c -F: lib/test_kprobes.c +F: lib/tests/test_kprobes.c F: samples/kprobes KS0108 LCD CONTROLLER DRIVER @@ -13317,7 +13331,7 @@ M: Mark Brown <broonie@kernel.org> R: Matti Vaittinen <mazziesaccount@gmail.com> F: include/linux/linear_range.h F: lib/linear_ranges.c -F: lib/test_linear_ranges.c +F: lib/tests/test_linear_ranges.c LINUX FOR POWER MACINTOSH L: linuxppc-dev@lists.ozlabs.org @@ -13445,7 +13459,7 @@ M: David Gow <davidgow@google.com> L: linux-kselftest@vger.kernel.org L: kunit-dev@googlegroups.com S: Maintained -F: lib/list-test.c +F: lib/tests/list-test.c LITEX PLATFORM M: Karol Gugala <kgugala@antmicro.com> @@ -13752,12 +13766,10 @@ F: drivers/hwmon/ltc4282.c LTC4286 HARDWARE MONITOR DRIVER M: Delphine CC Chiu <Delphine_CC_Chiu@Wiwynn.com> -L: linux-i2c@vger.kernel.org +L: linux-hwmon@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/hwmon/lltc,ltc4286.yaml F: Documentation/hwmon/ltc4286.rst -F: drivers/hwmon/pmbus/Kconfig -F: drivers/hwmon/pmbus/Makefile F: drivers/hwmon/pmbus/ltc4286.c LTC4306 I2C MULTIPLEXER DRIVER @@ -16662,6 +16674,17 @@ F: net/mptcp/ F: tools/testing/selftests/bpf/*/*mptcp*.[ch] F: tools/testing/selftests/net/mptcp/ +NETWORKING [SRv6] +M: Andrea Mayer <andrea.mayer@uniroma2.it> +L: netdev@vger.kernel.org +S: 
Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git +F: include/linux/seg6* +F: include/net/seg6* +F: include/uapi/linux/seg6* +F: net/ipv6/seg6* +F: tools/testing/selftests/net/srv6* + NETWORKING [TCP] M: Eric Dumazet <edumazet@google.com> M: Neal Cardwell <ncardwell@google.com> @@ -19912,6 +19935,7 @@ S: Supported F: Documentation/arch/x86/resctrl* F: arch/x86/include/asm/resctrl.h F: arch/x86/kernel/cpu/resctrl/ +F: include/linux/resctrl*.h F: tools/testing/selftests/resctrl/ READ-COPY UPDATE (RCU) @@ -21183,8 +21207,7 @@ S: Maintained W: https://github.com/sched-ext/scx T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git F: include/linux/sched/ext.h -F: kernel/sched/ext.h -F: kernel/sched/ext.c +F: kernel/sched/ext* F: tools/sched_ext/ F: tools/testing/selftests/sched_ext @@ -21766,7 +21789,7 @@ M: Jason A. Donenfeld <Jason@zx2c4.com> S: Maintained F: include/linux/siphash.h F: lib/siphash.c -F: lib/siphash_kunit.c +F: lib/tests/siphash_kunit.c SIS 190 ETHERNET DRIVER M: Francois Romieu <romieu@fr.zoreil.com> @@ -23110,12 +23133,6 @@ L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/system76_acpi.c -SYSV FILESYSTEM -S: Orphan -F: Documentation/filesystems/sysv-fs.rst -F: fs/sysv/ -F: include/linux/sysv_fs.h - TASKSTATS STATISTICS INTERFACE M: Balbir Singh <bsingharora@gmail.com> S: Maintained @@ -25447,8 +25464,8 @@ R: Sergey Senozhatsky <senozhatsky@chromium.org> S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: Documentation/core-api/printk-formats.rst -F: lib/test_printf.c -F: lib/test_scanf.c +F: lib/tests/printf_kunit.c +F: lib/tests/scanf_kunit.c F: lib/vsprintf.c VT1211 HARDWARE MONITOR DRIVER @@ -25981,7 +25998,6 @@ F: include/xen/swiotlb-xen.h XFS FILESYSTEM M: Carlos Maiolino <cem@kernel.org> -R: Darrick J. Wong <djwong@kernel.org> L: linux-xfs@vger.kernel.org S: Supported W: http://xfs.org/ @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* @@ -1014,6 +1014,9 @@ CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers endif +ifdef CONFIG_FINEIBT_BHI + CC_FLAGS_CFI += -fsanitize-kcfi-arity +endif ifdef CONFIG_RUST # Always pass -Zsanitizer-cfi-normalize-integers as CONFIG_RUST selects # CONFIG_CFI_ICALL_NORMALIZE_INTEGERS. diff --git a/arch/Kconfig b/arch/Kconfig index b8a4ff365582..9f6eb09ef12d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1584,6 +1584,10 @@ config HAVE_SPARSE_SYSCALL_NR entries at 4000, 5000 and 6000 locations. This option turns on syscall related optimizations for a given architecture. 
+config ARCH_HAS_VDSO_ARCH_DATA + depends on GENERIC_VDSO_DATA_STORE + bool + config ARCH_HAS_VDSO_TIME_DATA bool diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index c59d53d6d3f3..2dd6340de6b4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -506,3 +506,4 @@ 574 common getxattrat sys_getxattrat 575 common listxattrat sys_listxattrat 576 common removexattrat sys_removexattrat +577 common open_tree_attr sys_open_tree_attr diff --git a/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi b/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi index 6bf4241fe3b7..c78ed064d166 100644 --- a/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm2711-rpi.dtsi @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcm2835-rpi.dtsi" -#include <dt-bindings/power/raspberrypi-power.h> #include <dt-bindings/reset/raspberrypi,firmware-reset.h> / { @@ -101,7 +100,3 @@ &vchiq { interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>; }; - -&xhci { - power-domains = <&power RPI_POWER_DOMAIN_USB>; -}; diff --git a/arch/arm/boot/dts/broadcom/bcm2711.dtsi b/arch/arm/boot/dts/broadcom/bcm2711.dtsi index e4e42af21ef3..c06d9f5e53c8 100644 --- a/arch/arm/boot/dts/broadcom/bcm2711.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm2711.dtsi @@ -134,7 +134,7 @@ clocks = <&clocks BCM2835_CLOCK_UART>, <&clocks BCM2835_CLOCK_VPU>; clock-names = "uartclk", "apb_pclk"; - arm,primecell-periphid = <0x00241011>; + arm,primecell-periphid = <0x00341011>; status = "disabled"; }; @@ -145,7 +145,7 @@ clocks = <&clocks BCM2835_CLOCK_UART>, <&clocks BCM2835_CLOCK_VPU>; clock-names = "uartclk", "apb_pclk"; - arm,primecell-periphid = <0x00241011>; + arm,primecell-periphid = <0x00341011>; status = "disabled"; }; @@ -156,7 +156,7 @@ clocks = <&clocks BCM2835_CLOCK_UART>, <&clocks BCM2835_CLOCK_VPU>; clock-names = "uartclk", "apb_pclk"; - arm,primecell-periphid = <0x00241011>; + arm,primecell-periphid = <0x00341011>; status = "disabled"; }; @@ -167,7 +167,7 @@ clocks = <&clocks BCM2835_CLOCK_UART>, <&clocks BCM2835_CLOCK_VPU>; clock-names = "uartclk", "apb_pclk"; - arm,primecell-periphid = <0x00241011>; + arm,primecell-periphid = <0x00341011>; status = "disabled"; }; @@ -451,8 +451,6 @@ IRQ_TYPE_LEVEL_LOW)>, <GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>; - /* This only applies to the ARMv7 stub */ - arm,cpu-registers-not-fw-configured; }; cpus: cpus { @@ -610,6 +608,7 @@ #address-cells = <1>; #size-cells = <0>; interrupts = <GIC_SPI 176 IRQ_TYPE_LEVEL_HIGH>; + power-domains = <&pm BCM2835_POWER_DOMAIN_USB>; /* DWC2 and this IP block share the same USB PHY, * enabling both at the same time results in lockups. 
* So keep this node disabled and let the bootloader @@ -1177,6 +1176,7 @@ }; &uart0 { + arm,primecell-periphid = <0x00341011>; interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>; }; diff --git a/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts b/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts index 53cb0c58f6d0..3da2daee0c84 100644 --- a/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts +++ b/arch/arm/boot/dts/broadcom/bcm4709-asus-rt-ac3200.dts @@ -124,19 +124,19 @@ }; port@1 { - label = "lan1"; + label = "lan4"; }; port@2 { - label = "lan2"; + label = "lan3"; }; port@3 { - label = "lan3"; + label = "lan2"; }; port@4 { - label = "lan4"; + label = "lan1"; }; }; }; diff --git a/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts b/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts index 6c666dc7ad23..01ec8c03686a 100644 --- a/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts +++ b/arch/arm/boot/dts/broadcom/bcm47094-asus-rt-ac5300.dts @@ -126,11 +126,11 @@ ports { port@0 { - label = "lan4"; + label = "wan"; }; port@1 { - label = "lan3"; + label = "lan1"; }; port@2 { @@ -138,11 +138,11 @@ }; port@3 { - label = "lan1"; + label = "lan3"; }; port@4 { - label = "wan"; + label = "lan4"; }; }; }; diff --git a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi index dffab5aa8b9c..88be29166c1a 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6qdl-apalis.dtsi @@ -108,6 +108,11 @@ }; }; + poweroff { + compatible = "regulator-poweroff"; + cpu-supply = <&vgen2_reg>; + }; + reg_module_3v3: regulator-module-3v3 { compatible = "regulator-fixed"; regulator-always-on; @@ -236,10 +241,6 @@ status = "disabled"; }; -&clks { - fsl,pmic-stby-poweroff; -}; - /* Apalis SPI1 */ &ecspi1 { cs-gpios = <&gpio5 25 GPIO_ACTIVE_LOW>; @@ -527,7 +528,6 @@ pmic: pmic@8 { compatible = "fsl,pfuze100"; - fsl,pmic-stby-poweroff; reg = <0x08>; regulators { diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 1815748f5d2a..bae5edf348ef 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -381,7 +381,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size); void iounmap(volatile void __iomem *io_addr); #define iounmap iounmap -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); #define arch_memremap_wb arch_memremap_wb /* diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h index 5b85889f82ee..88364a6727ff 100644 --- a/arch/arm/include/asm/vdso.h +++ b/arch/arm/include/asm/vdso.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#define __VDSO_PAGES 4 + #ifndef __ASSEMBLY__ struct mm_struct; diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 592d3d015ca7..1e9f81639c88 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -112,7 +112,7 @@ static inline bool arm_vdso_hres_capable(void) #define __arch_vdso_hres_capable arm_vdso_hres_capable static __always_inline u64 __arch_get_hw_counter(int clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { #ifdef CONFIG_ARM_ARCH_TIMER u64 cycle_now; @@ -135,11 +135,6 @@ static __always_inline u64 __arch_get_hw_counter(int clock_mode, #endif } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff 
--git a/arch/arm/include/asm/vdso/vsyscall.h b/arch/arm/include/asm/vdso/vsyscall.h index 705414710dcd..4e7226ad02ec 100644 --- a/arch/arm/include/asm/vdso/vsyscall.h +++ b/arch/arm/include/asm/vdso/vsyscall.h @@ -7,22 +7,14 @@ #include <vdso/datapage.h> #include <asm/cacheflush.h> -extern struct vdso_data *vdso_data; extern bool cntvct_ok; static __always_inline -struct vdso_data *__arm_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __arm_get_k_vdso_data - -static __always_inline -void __arm_sync_vdso_data(struct vdso_data *vdata) +void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { flush_dcache_page(virt_to_page(vdata)); } -#define __arch_sync_vdso_data __arm_sync_vdso_data +#define __arch_sync_vdso_time_data __arch_sync_vdso_time_data /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 4853875740d0..123f4a8ef446 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -153,10 +153,6 @@ int main(void) DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER); DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE); BLANK(); -#ifdef CONFIG_VDSO - DEFINE(VDSO_DATA_SIZE, sizeof(union vdso_data_store)); -#endif - BLANK(); #ifdef CONFIG_ARM_MPU DEFINE(MPU_RNG_INFO_RNGS, offsetof(struct mpu_rgn_info, rgns)); DEFINE(MPU_RNG_INFO_USED, offsetof(struct mpu_rgn_info, used)); diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 6ea645939573..afbd2ebe5c39 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -258,13 +258,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) barrier(); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#elif defined(CONFIG_PREEMPT_RT) -#define S_PREEMPT " PREEMPT_RT" -#else -#define S_PREEMPT "" -#endif #ifdef CONFIG_SMP #define S_SMP " SMP" #else @@ -282,8 +275,8 @@ static int __die(const char *str, int err, struct pt_regs *regs) static int die_counter; int ret; - pr_emerg("Internal error: %s: %x [#%d]" S_PREEMPT S_SMP S_ISA "\n", - str, err, ++die_counter); + pr_emerg("Internal error: %s: %x [#%d]" S_SMP S_ISA "\n", + str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 29dd2f3c62fe..325448ffbba0 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -7,6 +7,7 @@ */ #include <linux/cache.h> +#include <linux/vdso_datastore.h> #include <linux/elf.h> #include <linux/err.h> #include <linux/kernel.h> @@ -33,15 +34,6 @@ extern char vdso_start[], vdso_end[]; /* Total number of pages needed for the data and text portions of the VDSO. */ unsigned int vdso_total_pages __ro_after_init; -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - -static struct page *vdso_data_page __ro_after_init; -static const struct vm_special_mapping vdso_data_mapping = { - .name = "[vvar]", - .pages = &vdso_data_page, -}; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -192,9 +184,6 @@ static int __init vdso_init(void) if (vdso_text_pagelist == NULL) return -ENOMEM; - /* Grab the VDSO data page. */ - vdso_data_page = virt_to_page(vdso_data); - /* Grab the VDSO text pages. 
*/ for (i = 0; i < text_pages; i++) { struct page *page; @@ -205,7 +194,7 @@ static int __init vdso_init(void) vdso_text_mapping.pages = vdso_text_pagelist; - vdso_total_pages = 1; /* for the data/vvar page */ + vdso_total_pages = VDSO_NR_PAGES; /* for the data/vvar pages */ vdso_total_pages += text_pages; cntvct_ok = cntvct_functional(); @@ -216,16 +205,7 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -static int install_vvar(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma; - - vma = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ | VM_MAYREAD, - &vdso_data_mapping); - - return PTR_ERR_OR_ZERO(vma); -} +static_assert(__VDSO_PAGES == VDSO_NR_PAGES); /* assumes mmap_lock is write-locked */ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) @@ -238,12 +218,12 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) if (vdso_text_pagelist == NULL) return; - if (install_vvar(mm, addr)) + if (IS_ERR(vdso_install_vvar_mapping(mm, addr))) return; - /* Account for vvar page. */ - addr += PAGE_SIZE; - len = (vdso_total_pages - 1) << PAGE_SHIFT; + /* Account for vvar pages. */ + addr += VDSO_NR_PAGES * PAGE_SIZE; + len = (vdso_total_pages - VDSO_NR_PAGES) << PAGE_SHIFT; vma = _install_special_mapping(mm, addr, len, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index 2a8a9fe46586..3fa15f342240 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -27,6 +27,7 @@ config ARCH_DAVINCI_DA830 config ARCH_DAVINCI_DA850 bool "DA850/OMAP-L138/AM18x based system" + select ARCH_DAVINCI_DA8XX select DAVINCI_CP_INTC config ARCH_DAVINCI_DA8XX diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c index 2e497745b624..a044ea5cb4f1 100644 --- a/arch/arm/mach-davinci/da830.c +++ b/arch/arm/mach-davinci/da830.c @@ -11,7 +11,6 @@ #include <linux/gpio.h> #include <linux/init.h> #include <linux/io.h> -#include <linux/irqchip/irq-davinci-cp-intc.h> #include <clocksource/timer-davinci.h> diff --git a/arch/arm/mach-imx/mmdc.c b/arch/arm/mach-imx/mmdc.c index e898f7c2733e..94e4f4a2f73f 100644 --- a/arch/arm/mach-imx/mmdc.c +++ b/arch/arm/mach-imx/mmdc.c @@ -509,9 +509,8 @@ static int imx_mmdc_perf_init(struct platform_device *pdev, void __iomem *mmdc_b pmu_mmdc->mmdc_ipg_clk = mmdc_ipg_clk; pmu_mmdc->devtype_data = device_get_match_data(&pdev->dev); - hrtimer_init(&pmu_mmdc->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - pmu_mmdc->hrtimer.function = mmdc_pmu_timer_handler; + hrtimer_setup(&pmu_mmdc->hrtimer, mmdc_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cpumask_set_cpu(raw_smp_processor_id(), &pmu_mmdc->cpu); diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig index a643b71e30a3..08ec6bd84ada 100644 --- a/arch/arm/mach-omap1/Kconfig +++ b/arch/arm/mach-omap1/Kconfig @@ -8,6 +8,7 @@ menuconfig ARCH_OMAP1 select ARCH_OMAP select CLKSRC_MMIO select FORCE_PCI if PCCARD + select GENERIC_IRQ_CHIP select GPIOLIB help Support for older TI OMAP1 (omap7xx, omap15xx or omap16xx) diff --git a/arch/arm/mach-shmobile/headsmp.S b/arch/arm/mach-shmobile/headsmp.S index a956b489b6ea..2bc7e73a8582 100644 --- a/arch/arm/mach-shmobile/headsmp.S +++ b/arch/arm/mach-shmobile/headsmp.S @@ -136,6 +136,7 @@ ENDPROC(shmobile_smp_sleep) .long shmobile_smp_arg - 1b .bss + .align 2 .globl shmobile_smp_mpidr shmobile_smp_mpidr: .space NR_CPUS * 4 diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 
2b6f50dd5478..5c1023a6d78c 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -928,6 +928,7 @@ config VDSO select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_32 select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE help Place in the process address space an ELF shared object providing fast implementations of gettimeofday and diff --git a/arch/arm/mm/cache-l2x0-pmu.c b/arch/arm/mm/cache-l2x0-pmu.c index 993fefdc167a..93ef0502b7ff 100644 --- a/arch/arm/mm/cache-l2x0-pmu.c +++ b/arch/arm/mm/cache-l2x0-pmu.c @@ -539,8 +539,7 @@ static __init int l2x0_pmu_init(void) * at higher frequencies. */ l2x0_pmu_poll_period = ms_to_ktime(1000); - hrtimer_init(&l2x0_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - l2x0_pmu_hrtimer.function = l2x0_pmu_poll; + hrtimer_setup(&l2x0_pmu_hrtimer, l2x0_pmu_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cpumask_set_cpu(0, &pmu_cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_ARM_L2X0_ONLINE, diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 89f1c97f3079..748698e91a4b 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -436,7 +436,7 @@ void __arm_iomem_set_ro(void __iomem *ptr, size_t size) set_memory_ro((unsigned long)ptr, PAGE_ALIGN(size) / PAGE_SIZE); } -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (__force void *)arch_ioremap_caller(phys_addr, size, MT_MEMORY_RW, diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 1a8f6914ee59..d638cc87807e 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -248,7 +248,7 @@ void __iomem *pci_remap_cfgspace(resource_size_t res_cookie, size_t size) EXPORT_SYMBOL_GPL(pci_remap_cfgspace); #endif -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (void *)phys_addr; } diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 49eeb2ad8dbd..27c1d5ebcd91 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -481,3 +481,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 8a306bbec4a0..cb044bfd145d 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include hostprogs := vdsomunge diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S index 9bfa0f52923c..7c08371f4400 100644 --- a/arch/arm/vdso/vdso.lds.S +++ b/arch/arm/vdso/vdso.lds.S @@ -11,16 +11,16 @@ */ #include <linux/const.h> -#include <asm/asm-offsets.h> #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE(_vdso_data = . - VDSO_DATA_SIZE); + VDSO_VVAR_SYMS . 
= SIZEOF_HEADERS; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 940343beb3d4..f2d724c1dfd2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -162,6 +162,7 @@ config ARM64 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND select HAS_IOPORT @@ -250,6 +251,7 @@ config ARM64 select HAVE_KRETPROBES select HAVE_GENERIC_VDSO select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU + select HOTPLUG_SMT if HOTPLUG_CPU select IRQ_DOMAIN select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN @@ -323,7 +325,7 @@ config ARCH_MMAP_RND_BITS_MIN default 18 # max bits determined by the following formula: -# VA_BITS - PAGE_SHIFT - 3 +# VA_BITS - PTDESC_TABLE_SHIFT config ARCH_MMAP_RND_BITS_MAX default 19 if ARM64_VA_BITS=36 default 24 if ARM64_VA_BITS=39 @@ -1302,6 +1304,15 @@ config NVIDIA_CARMEL_CNP_ERRATUM If unsure, say Y. +config ROCKCHIP_ERRATUM_3568002 + bool "Rockchip 3568002: GIC600 can not access physical addresses higher than 4GB" + default y + help + The Rockchip RK3566 and RK3568 GIC600 SoC integrations have AXI + addressing limited to the first 32bit of physical address space. + + If unsure, say Y. + config ROCKCHIP_ERRATUM_3588001 bool "Rockchip 3588001: GIC600 can not support shareability attributes" default y diff --git a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi index 689c82b7f596..9e610a89a337 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712.dtsi +++ b/arch/arm64/boot/dts/broadcom/bcm2712.dtsi @@ -227,7 +227,7 @@ interrupts = <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clk_uart>, <&clk_vpu>; clock-names = "uartclk", "apb_pclk"; - arm,primecell-periphid = <0x00241011>; + arm,primecell-periphid = <0x00341011>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi index ce20de259805..3d0b14968131 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin-dahlia.dtsi @@ -16,10 +16,10 @@ "Headphone Jack", "HPOUTR", "IN2L", "Line In Jack", "IN2R", "Line In Jack", - "Headphone Jack", "MICBIAS", - "IN1L", "Headphone Jack"; + "Microphone Jack", "MICBIAS", + "IN1L", "Microphone Jack"; simple-audio-card,widgets = - "Microphone", "Headphone Jack", + "Microphone", "Microphone Jack", "Headphone", "Headphone Jack", "Line", "Line In Jack"; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi index 336785a9fba8..3ddc5aaa7c5f 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-tqma8mpql.dtsi @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later OR MIT /* - * Copyright 2021-2022 TQ-Systems GmbH - * Author: Alexander Stein <alexander.stein@tq-group.com> + * Copyright 2021-2025 TQ-Systems GmbH <linux@ew.tq-group.com>, + * D-82229 Seefeld, Germany. 
+ * Author: Alexander Stein */ #include "imx8mp.dtsi" @@ -23,15 +24,6 @@ regulator-max-microvolt = <3300000>; regulator-always-on; }; - - /* e-MMC IO, needed for HS modes */ - reg_vcc1v8: regulator-vcc1v8 { - compatible = "regulator-fixed"; - regulator-name = "VCC1V8"; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; - regulator-always-on; - }; }; &A53_0 { @@ -197,7 +189,7 @@ no-sd; no-sdio; vmmc-supply = <®_vcc3v3>; - vqmmc-supply = <®_vcc1v8>; + vqmmc-supply = <&buck5_reg>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi index da8902c5f7e5..1493319aa748 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-verdin-dahlia.dtsi @@ -28,10 +28,10 @@ "Headphone Jack", "HPOUTR", "IN2L", "Line In Jack", "IN2R", "Line In Jack", - "Headphone Jack", "MICBIAS", - "IN1L", "Headphone Jack"; + "Microphone Jack", "MICBIAS", + "IN1L", "Microphone Jack"; simple-audio-card,widgets = - "Microphone", "Headphone Jack", + "Microphone", "Microphone Jack", "Headphone", "Headphone Jack", "Line", "Line In Jack"; diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index e0ce804bb1a3..d0314cdf0b92 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -5163,7 +5163,6 @@ <GIC_SPI 341 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 342 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 343 IRQ_TYPE_LEVEL_HIGH>; - dma-coherent; }; anoc_1_tbu: tbu@150c5000 { diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts index eb9470a00e54..1a59e8b1dc46 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts @@ -194,6 +194,13 @@ <3 RK_PB3 RK_FUNC_GPIO &pcfg_pull_none>; }; }; + + uart { + uart5_rts_pin: uart5-rts-pin { + rockchip,pins = + <0 RK_PB5 RK_FUNC_GPIO &pcfg_pull_none>; + }; + }; }; &pwm0 { @@ -222,10 +229,15 @@ }; &uart0 { + pinctrl-names = "default"; + pinctrl-0 = <&uart0_xfer>; status = "okay"; }; &uart5 { + /* Add pinmux for rts-gpios (uart5_rts_pin) */ + pinctrl-names = "default"; + pinctrl-0 = <&uart5_xfer &uart5_rts_pin>; rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi index b1c9bd0e63ef..8d94d9f91a5c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-nanopi-r4s.dtsi @@ -115,7 +115,7 @@ }; &u2phy1_host { - status = "disabled"; + phy-supply = <&vdd_5v>; }; &uart0 { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi index 69a9d6170649..51c6aa26d828 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-rockpro64.dtsi @@ -227,6 +227,16 @@ vin-supply = <&vcc12v_dcin>; }; + vcca_0v9: regulator-vcca-0v9 { + compatible = "regulator-fixed"; + regulator-name = "vcca_0v9"; + regulator-always-on; + regulator-boot-on; + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <900000>; + vin-supply = <&vcc3v3_sys>; + }; + vdd_log: regulator-vdd-log { compatible = "pwm-regulator"; pwms = <&pwm2 0 25000 1>; @@ -312,6 +322,8 @@ }; &hdmi { + avdd-0v9-supply = <&vcca_0v9>; + avdd-1v8-supply = <&vcc1v8_dvp>; ddc-i2c-bus = <&i2c3>; pinctrl-names = "default"; 
pinctrl-0 = <&hdmi_cec>; @@ -661,6 +673,8 @@ num-lanes = <4>; pinctrl-names = "default"; pinctrl-0 = <&pcie_perst>; + vpcie0v9-supply = <&vcca_0v9>; + vpcie1v8-supply = <&vcca_1v8>; vpcie12v-supply = <&vcc12v_dcin>; vpcie3v3-supply = <&vcc3v3_pcie>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts b/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts index 61dd71c259aa..ddf84c2a19cf 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts +++ b/arch/arm64/boot/dts/rockchip/rk3566-lubancat-1.dts @@ -512,7 +512,6 @@ &sdmmc0 { max-frequency = <150000000>; - supports-sd; bus-width = <4>; cap-mmc-highspeed; cap-sd-highspeed; diff --git a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi index e55390629114..4e730aecf84d 100644 --- a/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk356x-base.dtsi @@ -284,6 +284,18 @@ mbi-alias = <0x0 0xfd410000>; mbi-ranges = <296 24>; msi-controller; + ranges; + #address-cells = <2>; + #size-cells = <2>; + dma-noncoherent; + + its: msi-controller@fd440000 { + compatible = "arm,gic-v3-its"; + reg = <0x0 0xfd440000 0 0x20000>; + dma-noncoherent; + msi-controller; + #msi-cells = <1>; + }; }; usb_host0_ehci: usb@fd800000 { @@ -957,7 +969,7 @@ num-ib-windows = <6>; num-ob-windows = <2>; max-link-speed = <2>; - msi-map = <0x0 &gic 0x0 0x1000>; + msi-map = <0x0 &its 0x0 0x1000>; num-lanes = <1>; phys = <&combphy2 PHY_TYPE_PCIE>; phy-names = "pcie-phy"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts index 90f823b2c219..7f457ab78015 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-jaguar.dts @@ -503,7 +503,6 @@ non-removable; pinctrl-names = "default"; pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>; - supports-cqe; vmmc-supply = <&vcc_3v3_s3>; vqmmc-supply = <&vcc_1v8_s3>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts index 6d68f70284e4..2a0590209462 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5-itx.dts @@ -690,10 +690,9 @@ &sdhci { bus-width = <8>; - max-frequency = <200000000>; + max-frequency = <150000000>; mmc-hs400-1_8v; mmc-hs400-enhanced-strobe; - mmc-hs200-1_8v; no-sdio; no-sd; non-removable; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi index 81a6a05ce13b..e8fa449517c2 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi @@ -386,7 +386,6 @@ non-removable; pinctrl-names = "default"; pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>; - supports-cqe; vmmc-supply = <&vcc_3v3_s3>; vqmmc-supply = <&vcc_1v8_s3>; status = "okay"; diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h index b8a5861dc7b7..292f2687a12e 100644 --- a/arch/arm64/include/asm/asm-extable.h +++ b/arch/arm64/include/asm/asm-extable.h @@ -9,7 +9,8 @@ #define EX_TYPE_BPF 1 #define EX_TYPE_UACCESS_ERR_ZERO 2 #define EX_TYPE_KACCESS_ERR_ZERO 3 -#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 4 +#define EX_TYPE_UACCESS_CPY 4 +#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 5 /* Data fields for EX_TYPE_UACCESS_ERR_ZERO */ #define EX_DATA_REG_ERR_SHIFT 0 @@ -23,6 +24,9 @@ #define EX_DATA_REG_ADDR_SHIFT 5 #define EX_DATA_REG_ADDR GENMASK(9, 5) +/* Data fields 
for EX_TYPE_UACCESS_CPY */ +#define EX_DATA_UACCESS_WRITE BIT(0) + #ifdef __ASSEMBLY__ #define __ASM_EXTABLE_RAW(insn, fixup, type, data) \ @@ -69,6 +73,10 @@ .endif .endm + .macro _asm_extable_uaccess_cpy, insn, fixup, uaccess_is_write + __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_UACCESS_CPY, \uaccess_is_write) + .endm + #else /* __ASSEMBLY__ */ #include <linux/stringify.h> diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index 5b6efe8abeeb..9148f5a31968 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -61,6 +61,10 @@ alternative_else_nop_endif 9999: x; \ _asm_extable_uaccess 9999b, l +#define USER_CPY(l, uaccess_is_write, x...) \ +9999: x; \ + _asm_extable_uaccess_cpy 9999b, l, uaccess_is_write + /* * Generate the assembly for LDTR/STTR with exception table entries. * This is complicated as there is no post-increment or pair versions of the diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 06a4670bdb0b..99cd6546e72e 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -35,7 +35,7 @@ #define ARCH_DMA_MINALIGN (128) #define ARCH_KMALLOC_MINALIGN (8) -#ifndef __ASSEMBLY__ +#if !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) #include <linux/bitops.h> #include <linux/kasan-enabled.h> @@ -118,6 +118,6 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void) return ctr; } -#endif /* __ASSEMBLY__ */ +#endif /* !defined(__ASSEMBLY__) && !defined(BUILD_VDSO) */ #endif diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index cc2d58141be1..c607e0bf5e0b 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -75,6 +75,7 @@ #define ARM_CPU_PART_CORTEX_A76 0xD0B #define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define ARM_CPU_PART_CORTEX_A77 0xD0D +#define ARM_CPU_PART_CORTEX_A76AE 0xD0E #define ARM_CPU_PART_NEOVERSE_V1 0xD40 #define ARM_CPU_PART_CORTEX_A78 0xD41 #define ARM_CPU_PART_CORTEX_A78AE 0xD42 @@ -119,6 +120,7 @@ #define QCOM_CPU_PART_KRYO 0x200 #define QCOM_CPU_PART_KRYO_2XX_GOLD 0x800 #define QCOM_CPU_PART_KRYO_2XX_SILVER 0x801 +#define QCOM_CPU_PART_KRYO_3XX_GOLD 0x802 #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 @@ -159,6 +161,7 @@ #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77) +#define MIDR_CORTEX_A76AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76AE) #define MIDR_NEOVERSE_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V1) #define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78) #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) @@ -196,10 +199,21 @@ #define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #define MIDR_QCOM_KRYO_2XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_GOLD) #define MIDR_QCOM_KRYO_2XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_2XX_SILVER) +#define MIDR_QCOM_KRYO_3XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_GOLD) #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define 
MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) #define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) + +/* + * NOTES: + * - Qualcomm Kryo 5XX Prime / Gold ID themselves as MIDR_CORTEX_A77 + * - Qualcomm Kryo 5XX Silver IDs itself as MIDR_QCOM_KRYO_4XX_SILVER + * - Qualcomm Kryo 6XX Prime IDs itself as MIDR_CORTEX_X1 + * - Qualcomm Kryo 6XX Gold IDs itself as ARM_CPU_PART_CORTEX_A78 + * - Qualcomm Kryo 6XX Silver IDs itself as MIDR_CORTEX_A55 + */ + #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 555c613fd232..ebceaae3c749 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -259,6 +259,30 @@ .Lskip_fgt_\@: .endm +.macro __init_el2_fgt2 + mrs x1, id_aa64mmfr0_el1 + ubfx x1, x1, #ID_AA64MMFR0_EL1_FGT_SHIFT, #4 + cmp x1, #ID_AA64MMFR0_EL1_FGT_FGT2 + b.lt .Lskip_fgt2_\@ + + mov x0, xzr + mrs x1, id_aa64dfr0_el1 + ubfx x1, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp x1, #ID_AA64DFR0_EL1_PMUVer_V3P9 + b.lt .Lskip_pmuv3p9_\@ + + orr x0, x0, #HDFGRTR2_EL2_nPMICNTR_EL0 + orr x0, x0, #HDFGRTR2_EL2_nPMICFILTR_EL0 + orr x0, x0, #HDFGRTR2_EL2_nPMUACR_EL1 +.Lskip_pmuv3p9_\@: + msr_s SYS_HDFGRTR2_EL2, x0 + msr_s SYS_HDFGWTR2_EL2, x0 + msr_s SYS_HFGRTR2_EL2, xzr + msr_s SYS_HFGWTR2_EL2, xzr + msr_s SYS_HFGITR2_EL2, xzr +.Lskip_fgt2_\@: +.endm + .macro __init_el2_gcs mrs_s x1, SYS_ID_AA64PFR1_EL1 ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 @@ -304,6 +328,7 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt + __init_el2_fgt2 __init_el2_gcs .endm diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 72b0e71cc3de..9dc39612bdf5 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -33,6 +33,8 @@ do { \ (b)->data = (tmp).data; \ } while (0) +bool insn_may_access_user(unsigned long addr, unsigned long esr); + #ifdef CONFIG_BPF_JIT bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs); @@ -45,5 +47,5 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, } #endif /* !CONFIG_BPF_JIT */ -bool fixup_exception(struct pt_regs *regs); +bool fixup_exception(struct pt_regs *regs, unsigned long esr); #endif diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index f2a84efc3618..564bc09b3e06 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -80,7 +80,6 @@ extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void fpsimd_update_current_state(struct user_fpsimd_state const *state); -extern void fpsimd_kvm_prepare(void); struct cpu_fp_state { struct user_fpsimd_state *st; diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index fd5a08450b12..9e93733523f6 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -45,11 +45,11 @@ #define SPAN_NR_ENTRIES(vstart, vend, shift) \ ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1) -#define EARLY_ENTRIES(vstart, vend, shift, add) \ - (SPAN_NR_ENTRIES(vstart, vend, shift) + (add)) +#define EARLY_ENTRIES(lvl, 
vstart, vend) \ + SPAN_NR_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * PTDESC_TABLE_SHIFT) -#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \ - (lvls > lvl ? EARLY_ENTRIES(vstart, vend, SWAPPER_BLOCK_SHIFT + lvl * (PAGE_SHIFT - 3), add) : 0) +#define EARLY_LEVEL(lvl, lvls, vstart, vend, add) \ + ((lvls) > (lvl) ? EARLY_ENTRIES(lvl, vstart, vend) + (add) : 0) #define EARLY_PAGES(lvls, vstart, vend, add) (1 /* PGDIR page */ \ + EARLY_LEVEL(3, (lvls), (vstart), (vend), add) /* each entry needs a next level page table */ \ diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index f8f78f622dd2..a2a1eeb36d4b 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -21,4 +21,15 @@ static inline bool force_dma_unencrypted(struct device *dev) return is_realm_world(); } +/* + * For Arm CCA guests, canonical addresses are "encrypted", so no changes + * required for dma_addr_encrypted(). + * The unencrypted DMA buffers must be accessed via the unprotected IPA, + * "top IPA bit" set. + */ +#define dma_addr_unencrypted(x) ((x) | PROT_NS_SHARED) + +/* Clear the "top" IPA bit while converting back */ +#define dma_addr_canonical(x) ((x) & ~PROT_NS_SHARED) + #endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index a9136cc551cc..f3b77deedfa2 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -7,40 +7,46 @@ #include <asm/memory.h> +#define PTDESC_ORDER 3 + +/* Number of VA bits resolved by a single translation table level */ +#define PTDESC_TABLE_SHIFT (PAGE_SHIFT - PTDESC_ORDER) + /* * Number of page-table levels required to address 'va_bits' wide * address, without section mapping. We resolve the top (va_bits - PAGE_SHIFT) - * bits with (PAGE_SHIFT - 3) bits at each page table level. Hence: + * bits with PTDESC_TABLE_SHIFT bits at each page table level. Hence: * - * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), (PAGE_SHIFT - 3)) + * levels = DIV_ROUND_UP((va_bits - PAGE_SHIFT), PTDESC_TABLE_SHIFT) * * where DIV_ROUND_UP(n, d) => (((n) + (d) - 1) / (d)) * * We cannot include linux/kernel.h which defines DIV_ROUND_UP here * due to build issues. So we open code DIV_ROUND_UP here: * - * ((((va_bits) - PAGE_SHIFT) + (PAGE_SHIFT - 3) - 1) / (PAGE_SHIFT - 3)) + * ((((va_bits) - PAGE_SHIFT) + PTDESC_TABLE_SHIFT - 1) / PTDESC_TABLE_SHIFT) * * which gets simplified as : */ -#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3)) +#define ARM64_HW_PGTABLE_LEVELS(va_bits) \ + (((va_bits) - PTDESC_ORDER - 1) / PTDESC_TABLE_SHIFT) /* * Size mapped by an entry at level n ( -1 <= n <= 3) - * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits + * We map PTDESC_TABLE_SHIFT at all translation levels and PAGE_SHIFT bits * in the final page. The maximum number of translation levels supported by * the architecture is 5. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. 
* So, the total number of bits mapped by an entry at level n is : * - * ((4 - n) - 1) * (PAGE_SHIFT - 3) + PAGE_SHIFT + * ((4 - n) - 1) * PTDESC_TABLE_SHIFT + PAGE_SHIFT * * Rearranging it a bit we get : - * (4 - n) * (PAGE_SHIFT - 3) + 3 + * (4 - n) * PTDESC_TABLE_SHIFT + PTDESC_ORDER */ -#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) ((PAGE_SHIFT - 3) * (4 - (n)) + 3) +#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) (PTDESC_TABLE_SHIFT * (4 - (n)) + PTDESC_ORDER) -#define PTRS_PER_PTE (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PTE (1 << PTDESC_TABLE_SHIFT) /* * PMD_SHIFT determines the size a level 2 page table entry can map. @@ -49,7 +55,7 @@ #define PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PTRS_PER_PMD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PMD (1 << PTDESC_TABLE_SHIFT) #endif /* @@ -59,14 +65,14 @@ #define PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) #define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PTRS_PER_PUD (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_PUD (1 << PTDESC_TABLE_SHIFT) #endif #if CONFIG_PGTABLE_LEVELS > 4 #define P4D_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(0) #define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) #define P4D_MASK (~(P4D_SIZE-1)) -#define PTRS_PER_P4D (1 << (PAGE_SHIFT - 3)) +#define PTRS_PER_P4D (1 << PTDESC_TABLE_SHIFT) #endif /* @@ -97,7 +103,6 @@ * Level -1 descriptor (PGD). */ #define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) -#define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) #define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) #define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) @@ -107,7 +112,6 @@ * Level 0 descriptor (P4D). */ #define P4D_TYPE_TABLE (_AT(p4dval_t, 3) << 0) -#define P4D_TABLE_BIT (_AT(p4dval_t, 1) << 1) #define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0) #define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0) #define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */ @@ -119,7 +123,6 @@ * Level 1 descriptor (PUD). 
*/ #define PUD_TYPE_TABLE (_AT(pudval_t, 3) << 0) -#define PUD_TABLE_BIT (_AT(pudval_t, 1) << 1) #define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) #define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) #define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */ @@ -133,7 +136,6 @@ #define PMD_TYPE_MASK (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) -#define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) #define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ /* @@ -162,7 +164,6 @@ #define PTE_VALID (_AT(pteval_t, 1) << 0) #define PTE_TYPE_MASK (_AT(pteval_t, 3) << 0) #define PTE_TYPE_PAGE (_AT(pteval_t, 3) << 0) -#define PTE_TABLE_BIT (_AT(pteval_t, 1) << 1) #define PTE_USER (_AT(pteval_t, 1) << 6) /* AP[1] */ #define PTE_RDONLY (_AT(pteval_t, 1) << 7) /* AP[2] */ #define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index a95f1f77bb39..7830d031742e 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -169,25 +169,25 @@ static inline bool __pure lpa2_is_enabled(void) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ - PIRx_ELx_PERM(pte_pi_index(_PAGE_KERNEL), PIE_RW)) + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ + PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */ diff --git 
a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0b2a2ad1b9e8..84f05f781a70 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -68,10 +68,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) -/* - * Macros to convert between a physical address and its placement in a - * page table entry, taking care of 52-bit addresses. - */ #ifdef CONFIG_ARM64_PA_BITS_52 static inline phys_addr_t __pte_to_phys(pte_t pte) { @@ -84,8 +80,15 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK; } #else -#define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_LOW) -#define __phys_to_pte_val(phys) (phys) +static inline phys_addr_t __pte_to_phys(pte_t pte) +{ + return pte_val(pte) & PTE_ADDR_LOW; +} + +static inline pteval_t __phys_to_pte_val(phys_addr_t phys) +{ + return phys; +} #endif #define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) @@ -483,12 +486,12 @@ static inline pmd_t pte_pmd(pte_t pte) static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) { - return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT); + return __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) | PUD_TYPE_SECT); } static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) { - return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); + return __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); } static inline pte_t pte_swp_mkexclusive(pte_t pte) @@ -548,18 +551,6 @@ static inline int pmd_protnone(pmd_t pmd) #endif #define pmd_present(pmd) pte_present(pmd_pte(pmd)) - -/* - * THP definitions. - */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) @@ -585,7 +576,18 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_write(pmd) pte_write(pmd_pte(pmd)) -#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + /* + * It's possible that the pmd is present-invalid on entry + * and in that case it needs to remain present-invalid on + * exit. So ensure the VALID bit does not get modified. + */ + pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID; + pmdval_t val = PMD_TYPE_SECT & ~PTE_VALID; + + return __pmd((pmd_val(pmd) & ~mask) | val); +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) @@ -613,7 +615,18 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) -#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) +static inline pud_t pud_mkhuge(pud_t pud) +{ + /* + * It's possible that the pud is present-invalid on entry + * and in that case it needs to remain present-invalid on + * exit. So ensure the VALID bit does not get modified. 
+ */ + pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID; + pudval_t val = PUD_TYPE_SECT & ~PTE_VALID; + + return __pud((pud_val(pud) & ~mask) | val); +} #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) @@ -724,6 +737,18 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) #define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_huge(pmd_t pmd) +{ + /* + * If pmd is present-invalid, pmd_table() won't detect it + * as a table, so force the valid bit for the comparison. + */ + return pmd_val(pmd) && pmd_present(pmd) && + !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } static inline bool pud_table(pud_t pud) { return true; } @@ -805,7 +830,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!pud_table(pud)) +#define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \ + PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #ifndef __PAGETABLE_PMD_FOLDED #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) @@ -896,7 +922,9 @@ static inline bool mm_pud_folded(const struct mm_struct *mm) pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d)) -#define p4d_bad(p4d) (pgtable_l4_enabled() && !(p4d_val(p4d) & P4D_TABLE_BIT)) +#define p4d_bad(p4d) (pgtable_l4_enabled() && \ + ((p4d_val(p4d) & P4D_TYPE_MASK) != \ + P4D_TYPE_TABLE)) #define p4d_present(p4d) (!p4d_none(p4d)) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) @@ -1023,7 +1051,9 @@ static inline bool mm_p4d_folded(const struct mm_struct *mm) pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e)) #define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd)) -#define pgd_bad(pgd) (pgtable_l5_enabled() && !(pgd_val(pgd) & PGD_TABLE_BIT)) +#define pgd_bad(pgd) (pgtable_l5_enabled() && \ + ((pgd_val(pgd) & PGD_TYPE_MASK) != \ + PGD_TYPE_TABLE)) #define pgd_present(pgd) (!pgd_none(pgd)) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h index e06e9f473675..d913d5b529e4 100644 --- a/arch/arm64/include/asm/por.h +++ b/arch/arm64/include/asm/por.h @@ -6,26 +6,27 @@ #ifndef _ASM_ARM64_POR_H #define _ASM_ARM64_POR_H -#define POR_BITS_PER_PKEY 4 -#define POR_ELx_IDX(por_elx, idx) (((por_elx) >> ((idx) * POR_BITS_PER_PKEY)) & 0xf) +#include <asm/sysreg.h> + +#define POR_EL0_INIT POR_ELx_PERM_PREP(0, POE_RWX) static inline bool por_elx_allows_read(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return perm & POE_R; } static inline bool por_elx_allows_write(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return perm & POE_W; } static inline bool por_elx_allows_exec(u64 por, u8 pkey) { - u8 perm = POR_ELx_IDX(por, pkey); + u8 perm = POR_ELx_PERM_GET(pkey, por); return perm & POE_X; } diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 0c4d9045c31f..f1524cdeacf1 100644 --- a/arch/arm64/include/asm/spectre.h +++ 
b/arch/arm64/include/asm/spectre.h @@ -97,7 +97,6 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); -u8 spectre_bhb_loop_affected(int scope); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9511f3faac46..2639d3633073 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1032,8 +1032,11 @@ #define PIE_RX UL(0xa) #define PIE_RW UL(0xc) #define PIE_RWX UL(0xe) +#define PIE_MASK UL(0xf) -#define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +#define PIRx_ELx_BITS_PER_IDX 4 +#define PIRx_ELx_PERM_SHIFT(idx) ((idx) * PIRx_ELx_BITS_PER_IDX) +#define PIRx_ELx_PERM_PREP(idx, perm) (((perm) & PIE_MASK) << PIRx_ELx_PERM_SHIFT(idx)) /* * Permission Overlay Extension (POE) permission encodings. @@ -1044,12 +1047,14 @@ #define POE_RX UL(0x3) #define POE_W UL(0x4) #define POE_RW UL(0x5) -#define POE_XW UL(0x6) -#define POE_RXW UL(0x7) +#define POE_WX UL(0x6) +#define POE_RWX UL(0x7) #define POE_MASK UL(0xf) -/* Initial value for Permission Overlay Extension for EL0 */ -#define POR_EL0_INIT POE_RXW +#define POR_ELx_BITS_PER_IDX 4 +#define POR_ELx_PERM_SHIFT(idx) ((idx) * POR_ELx_BITS_PER_IDX) +#define POR_ELx_PERM_GET(idx, reg) (((reg) >> POR_ELx_PERM_SHIFT(idx)) & POE_MASK) +#define POR_ELx_PERM_PREP(idx, perm) (((perm) & POE_MASK) << POR_ELx_PERM_SHIFT(idx)) /* * Definitions for Guarded Control Stack diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 3e3c3fdb1842..61679070f595 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_H #define __ASM_VDSO_H -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index 778c1202bbbf..d60ea7a72a9c 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -104,7 +104,7 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { u64 res; @@ -131,45 +131,33 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - const struct vdso_data *ret; + const struct vdso_time_data *ret; /* - * This simply puts &_vdso_data into ret. The reason why we don't use - * `ret = _vdso_data` is that the compiler tends to optimise this in a - * very suboptimal way: instead of keeping &_vdso_data in a register, - * it goes through a relocation almost every time _vdso_data must be + * This simply puts &_vdso_time_data into ret. The reason why we don't use + * `ret = _vdso_time_data` is that the compiler tends to optimise this in a + * very suboptimal way: instead of keeping &_vdso_time_data in a register, + * it goes through a relocation almost every time _vdso_time_data must be * accessed (even in subfunctions). 
This is both time and space * consuming: each relocation uses a word in the code section, and it * has to be loaded at runtime. * * This trick hides the assignment from the compiler. Since it cannot * track where the pointer comes from, it will only use one relocation - * where __arch_get_vdso_data() is called, and then keep the result in - * a register. + * where __aarch64_get_vdso_u_time_data() is called, and then keep the + * result in a register. */ - asm volatile("mov %0, %1" : "=r"(ret) : "r"(_vdso_data)); + asm volatile("mov %0, %1" : "=r"(ret) : "r"(&vdso_u_time_data)); return ret; } +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - const struct vdso_data *ret; - - /* See __arch_get_vdso_data(). */ - asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data)); - - return ret; -} -#endif - -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { - return vd->clock_mode == VDSO_CLOCKMODE_ARCHTIMER; + return vc->clock_mode == VDSO_CLOCKMODE_ARCHTIMER; } #define vdso_clocksource_ok vdso_clocksource_ok diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h index 342f807e2044..a2197da1951b 100644 --- a/arch/arm64/include/asm/vdso/getrandom.h +++ b/arch/arm64/include/asm/vdso/getrandom.h @@ -33,18 +33,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - /* - * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace - * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ - * PAGE_OFFSET points to the real VVAR page. 
- */ - if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * (1UL << CONFIG_PAGE_SHIFT); - return &_vdso_rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 764d13e2916c..92a2b59a9f3d 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -67,7 +67,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { u64 res; @@ -99,20 +99,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } -static __always_inline -const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h index eea51946d45a..de58951b8df6 100644 --- a/arch/arm64/include/asm/vdso/vsyscall.h +++ b/arch/arm64/include/asm/vdso/vsyscall.h @@ -2,44 +2,21 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 480 - #ifndef __ASSEMBLY__ #include <vdso/datapage.h> -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - #define VDSO_PRECISION_MASK ~(0xFF00ULL<<48) -extern struct vdso_data *vdso_data; /* * Update the vDSO data page to keep in sync with kernel timekeeping. */ static __always_inline -struct vdso_data *__arm64_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __arm64_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data - -static __always_inline -void __arm64_update_vsyscall(struct vdso_data *vdata) +void __arm64_update_vsyscall(struct vdso_time_data *vdata) { - vdata[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; - vdata[CS_RAW].mask = VDSO_PRECISION_MASK; + vdata->clock_data[CS_HRES_COARSE].mask = VDSO_PRECISION_MASK; + vdata->clock_data[CS_RAW].mask = VDSO_PRECISION_MASK; } #define __arch_update_vsyscall __arm64_update_vsyscall diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 2e94d20c4ac7..b735f4c2fe5e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm, int ret = 1; unsigned long addr; void *tags = NULL; + int locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { - struct page *page = get_dump_page(addr); + struct page *page = get_dump_page(addr, &locked); /* * get_dump_page() returns NULL when encountering an empty diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 2b69e3beeef8..81345f68f9fc 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -31,7 +31,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, { u64 cmask = (level == 3) ? 
CONT_PTE_SIZE - 1 : U64_MAX; pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; - int lshift = (3 - level) * (PAGE_SHIFT - 3); + int lshift = (3 - level) * PTDESC_TABLE_SHIFT; u64 lmask = (PAGE_SIZE << lshift) - 1; start &= PAGE_MASK; @@ -45,12 +45,12 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, * clearing the mapping */ if (protval) - protval |= (level < 3) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; + protval |= (level == 2) ? PMD_TYPE_SECT : PTE_TYPE_PAGE; while (start < end) { u64 next = min((start | lmask) + 1, PAGE_ALIGN(end)); - if (level < 3 && (start | next | pa) & lmask) { + if (level < 2 || (level == 2 && (start | next | pa) & lmask)) { /* * This chunk needs a finer grained mapping. Create a * table mapping if necessary and recurse. diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index a573fa40d4b6..d5d11fd11549 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -845,52 +845,86 @@ static unsigned long system_bhb_mitigations; * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any * SCOPE_SYSTEM call will give the right answer. */ -u8 spectre_bhb_loop_affected(int scope) +static bool is_spectre_bhb_safe(int scope) +{ + static const struct midr_range spectre_bhb_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A510), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A520), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + {}, + }; + static bool all_safe = true; + + if (scope != SCOPE_LOCAL_CPU) + return all_safe; + + if (is_midr_in_range_list(spectre_bhb_safe_list)) + return true; + + all_safe = false; + + return false; +} + +static u8 spectre_bhb_loop_affected(void) { u8 k = 0; - static u8 max_bhb_k; - - if (scope == SCOPE_LOCAL_CPU) { - static const struct midr_range spectre_bhb_k32_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), - MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), - {}, - }; - static const struct midr_range spectre_bhb_k24_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), - MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), - {}, - }; - static const struct midr_range spectre_bhb_k11_list[] = { - MIDR_ALL_VERSIONS(MIDR_AMPERE1), - {}, - }; - static const struct midr_range spectre_bhb_k8_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - {}, - }; - - if (is_midr_in_range_list(spectre_bhb_k32_list)) - k = 32; - else if (is_midr_in_range_list(spectre_bhb_k24_list)) - k = 24; - else if (is_midr_in_range_list(spectre_bhb_k11_list)) - k = 11; - else if (is_midr_in_range_list(spectre_bhb_k8_list)) - k = 8; - - max_bhb_k = max(max_bhb_k, k); - } else { - k = max_bhb_k; - } + + static const struct midr_range spectre_bhb_k132_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + }; + static const struct midr_range spectre_bhb_k38_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + }; + static const struct midr_range spectre_bhb_k32_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), + 
MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1), + {}, + }; + static const struct midr_range spectre_bhb_k24_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), + MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + {}, + }; + static const struct midr_range spectre_bhb_k11_list[] = { + MIDR_ALL_VERSIONS(MIDR_AMPERE1), + {}, + }; + static const struct midr_range spectre_bhb_k8_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + {}, + }; + + if (is_midr_in_range_list(spectre_bhb_k132_list)) + k = 132; + else if (is_midr_in_range_list(spectre_bhb_k38_list)) + k = 38; + else if (is_midr_in_range_list(spectre_bhb_k32_list)) + k = 32; + else if (is_midr_in_range_list(spectre_bhb_k24_list)) + k = 24; + else if (is_midr_in_range_list(spectre_bhb_k11_list)) + k = 11; + else if (is_midr_in_range_list(spectre_bhb_k8_list)) + k = 8; return k; } @@ -916,28 +950,13 @@ static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void) } } -static bool is_spectre_bhb_fw_affected(int scope) +static bool has_spectre_bhb_fw_mitigation(void) { - static bool system_affected; enum mitigation_state fw_state; bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE; - static const struct midr_range spectre_bhb_firmware_mitigated_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - {}, - }; - bool cpu_in_list = is_midr_in_range_list(spectre_bhb_firmware_mitigated_list); - - if (scope != SCOPE_LOCAL_CPU) - return system_affected; fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (cpu_in_list || (has_smccc && fw_state == SPECTRE_MITIGATED)) { - system_affected = true; - return true; - } - - return false; + return has_smccc && fw_state == SPECTRE_MITIGATED; } static bool supports_ecbhb(int scope) @@ -953,6 +972,8 @@ static bool supports_ecbhb(int scope) ID_AA64MMFR1_EL1_ECBHB_SHIFT); } +static u8 max_bhb_k; + bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope) { @@ -961,16 +982,18 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, if (supports_csv2p3(scope)) return false; - if (supports_clearbhb(scope)) - return true; - - if (spectre_bhb_loop_affected(scope)) - return true; + if (is_spectre_bhb_safe(scope)) + return false; - if (is_spectre_bhb_fw_affected(scope)) - return true; + /* + * At this point the core isn't known to be "safe" so we're going to + * assume it's vulnerable. We still need to update `max_bhb_k` though, + * but only if we aren't mitigating with clearbhb though. 
+ */ + if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) + max_bhb_k = max(max_bhb_k, spectre_bhb_loop_affected()); - return false; + return true; } static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) @@ -1001,7 +1024,7 @@ early_param("nospectre_bhb", parse_spectre_bhb_param); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) { bp_hardening_cb_t cpu_cb; - enum mitigation_state fw_state, state = SPECTRE_VULNERABLE; + enum mitigation_state state = SPECTRE_VULNERABLE; struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU)) @@ -1027,7 +1050,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN); state = SPECTRE_MITIGATED; set_bit(BHB_INSN, &system_bhb_mitigations); - } else if (spectre_bhb_loop_affected(SCOPE_LOCAL_CPU)) { + } else if (spectre_bhb_loop_affected()) { /* * Ensure KVM uses the indirect vector which will have the * branchy-loop added. A57/A72-r0 will already have selected @@ -1040,32 +1063,29 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP); state = SPECTRE_MITIGATED; set_bit(BHB_LOOP, &system_bhb_mitigations); - } else if (is_spectre_bhb_fw_affected(SCOPE_LOCAL_CPU)) { - fw_state = spectre_bhb_get_cpu_fw_mitigation_state(); - if (fw_state == SPECTRE_MITIGATED) { - /* - * Ensure KVM uses one of the spectre bp_hardening - * vectors. The indirect vector doesn't include the EL3 - * call, so needs upgrading to - * HYP_VECTOR_SPECTRE_INDIRECT. - */ - if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) - data->slot += 1; - - this_cpu_set_vectors(EL1_VECTOR_BHB_FW); - - /* - * The WA3 call in the vectors supersedes the WA1 call - * made during context-switch. Uninstall any firmware - * bp_hardening callback. - */ - cpu_cb = spectre_v2_get_sw_mitigation_cb(); - if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) - __this_cpu_write(bp_hardening_data.fn, NULL); - - state = SPECTRE_MITIGATED; - set_bit(BHB_FW, &system_bhb_mitigations); - } + } else if (has_spectre_bhb_fw_mitigation()) { + /* + * Ensure KVM uses one of the spectre bp_hardening + * vectors. The indirect vector doesn't include the EL3 + * call, so needs upgrading to + * HYP_VECTOR_SPECTRE_INDIRECT. + */ + if (!data->slot || data->slot == HYP_VECTOR_INDIRECT) + data->slot += 1; + + this_cpu_set_vectors(EL1_VECTOR_BHB_FW); + + /* + * The WA3 call in the vectors supersedes the WA1 call + * made during context-switch. Uninstall any firmware + * bp_hardening callback. 
+ */ + cpu_cb = spectre_v2_get_sw_mitigation_cb(); + if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb) + __this_cpu_write(bp_hardening_data.fn, NULL); + + state = SPECTRE_MITIGATED; + set_bit(BHB_FW, &system_bhb_mitigations); } update_mitigation_state(&spectre_bhb_state, state); @@ -1099,7 +1119,6 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, { u8 rd; u32 insn; - u16 loop_count = spectre_bhb_loop_affected(SCOPE_SYSTEM); BUG_ON(nr_inst != 1); /* MOV -> MOV */ @@ -1108,7 +1127,7 @@ void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt, insn = le32_to_cpu(*origptr); rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, insn); - insn = aarch64_insn_gen_movewide(rd, loop_count, 0, + insn = aarch64_insn_gen_movewide(rd, max_bhb_k, 0, AARCH64_INSN_VARIANT_64BIT, AARCH64_INSN_MOVEWIDE_ZERO); *updptr++ = cpu_to_le32(insn); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 99ea26d400ff..a7c37afb4ebe 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -91,7 +91,7 @@ static void save_reset_user_access_state(struct user_access_state *ua_state) u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) - por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); + por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index cb180684d10d..5d07ee85bdae 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -15,8 +15,11 @@ #include <linux/arch_topology.h> #include <linux/cacheinfo.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/init.h> #include <linux/percpu.h> +#include <linux/sched/isolation.h> +#include <linux/xarray.h> #include <asm/cpu.h> #include <asm/cputype.h> @@ -37,17 +40,28 @@ static bool __init acpi_cpu_is_threaded(int cpu) return !!is_threaded; } +struct cpu_smt_info { + unsigned int thread_num; + int core_id; +}; + /* * Propagate the topology information of the processor_topology_node tree to the * cpu_topology array. */ int __init parse_acpi_topology(void) { + unsigned int max_smt_thread_num = 1; + struct cpu_smt_info *entry; + struct xarray hetero_cpu; + unsigned long hetero_id; int cpu, topology_id; if (acpi_disabled) return 0; + xa_init(&hetero_cpu); + for_each_possible_cpu(cpu) { topology_id = find_acpi_cpu_topology(cpu, 0); if (topology_id < 0) @@ -57,6 +71,34 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = topology_id; topology_id = find_acpi_cpu_topology(cpu, 1); cpu_topology[cpu].core_id = topology_id; + + /* + * In the PPTT, CPUs below a node with the 'identical + * implementation' flag have the same number of threads. + * Count the number of threads for only one CPU (i.e. + * one core_id) among those with the same hetero_id. + * See the comment of find_acpi_cpu_topology_hetero_id() + * for more details. 
+ * + * One entry is created for each node having: + * - the 'identical implementation' flag + * - its parent not having the flag + */ + hetero_id = find_acpi_cpu_topology_hetero_id(cpu); + entry = xa_load(&hetero_cpu, hetero_id); + if (!entry) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + WARN_ON_ONCE(!entry); + + if (entry) { + entry->core_id = topology_id; + entry->thread_num = 1; + xa_store(&hetero_cpu, hetero_id, + entry, GFP_KERNEL); + } + } else if (entry->core_id == topology_id) { + entry->thread_num++; + } } else { cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; @@ -67,6 +109,19 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].package_id = topology_id; } + /* + * This is a short loop since the number of XArray elements is the + * number of heterogeneous CPU clusters. On a homogeneous system + * there's only one entry in the XArray. + */ + xa_for_each(&hetero_cpu, hetero_id, entry) { + max_smt_thread_num = max(max_smt_thread_num, entry->thread_num); + xa_erase(&hetero_cpu, hetero_id); + kfree(entry); + } + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + xa_destroy(&hetero_cpu); return 0; } #endif @@ -88,18 +143,28 @@ int __init parse_acpi_topology(void) * initialized. */ static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); -static DEFINE_PER_CPU(u64, arch_const_cycles_prev); -static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; +struct amu_cntr_sample { + u64 arch_const_cycles_prev; + u64 arch_core_cycles_prev; + unsigned long last_scale_update; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples); + void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, read_corecnt()); - this_cpu_write(arch_const_cycles_prev, read_constcnt()); + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); + + amu_sample->arch_core_cycles_prev = read_corecnt(); + amu_sample->arch_const_cycles_prev = read_constcnt(); } static inline bool freq_counters_valid(int cpu) { + struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) return false; @@ -108,8 +173,8 @@ static inline bool freq_counters_valid(int cpu) return false; } - if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || - !per_cpu(arch_core_cycles_prev, cpu))) { + if (unlikely(!amu_sample->arch_const_cycles_prev || + !amu_sample->arch_core_cycles_prev)) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); return false; } @@ -152,17 +217,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate) static void amu_scale_freq_tick(void) { + struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples); u64 prev_core_cnt, prev_const_cnt; u64 core_cnt, const_cnt, scale; - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + prev_const_cnt = amu_sample->arch_const_cycles_prev; + prev_core_cnt = amu_sample->arch_core_cycles_prev; update_freq_counters_refs(); - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); + const_cnt = amu_sample->arch_const_cycles_prev; + core_cnt = amu_sample->arch_core_cycles_prev; + /* + * This should not happen unless the AMUs have been reset and the + * counter values have not been restored - unlikely + */ if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) return; @@ -182,6 +252,8 @@ 
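The new PPTT handling in parse_acpi_topology() above counts, for a single representative core of each 'identical implementation' subtree, how many CPUs report that core_id, then hands the largest count to cpu_smt_set_num_threads(). A userspace approximation of the counting, where plain arrays stand in for the XArray and the topology ids are invented:

#include <stdio.h>

struct cpu { int hetero_id; int core_id; };

int main(void)
{
	/* two clusters: the first runs 2 threads per core, the second 4 */
	const struct cpu cpus[] = {
		{ 0, 10 }, { 0, 10 }, { 0, 11 }, { 0, 11 },
		{ 1, 20 }, { 1, 20 }, { 1, 20 }, { 1, 20 },
	};
	struct { int seen, core_id, threads; } entry[2] = {{ 0 }};
	int max_threads = 1;

	for (size_t i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		int h = cpus[i].hetero_id;

		if (!entry[h].seen) {			/* first CPU of the cluster */
			entry[h].seen = 1;
			entry[h].core_id = cpus[i].core_id;
			entry[h].threads = 1;
		} else if (entry[h].core_id == cpus[i].core_id) {
			entry[h].threads++;		/* another thread of that core */
		}
	}

	for (int h = 0; h < 2; h++)
		if (entry[h].threads > max_threads)
			max_threads = entry[h].threads;

	printf("max SMT threads: %d\n", max_threads);	/* 4 */
	return 0;
}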
static void amu_scale_freq_tick(void) scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); this_cpu_write(arch_freq_scale, (unsigned long)scale); + + amu_sample->last_scale_update = jiffies; } static struct scale_freq_data amu_sfd = { @@ -189,6 +261,96 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; +static __always_inline bool amu_fie_cpu_supported(unsigned int cpu) +{ + return cpumask_available(amu_fie_cpus) && + cpumask_test_cpu(cpu, amu_fie_cpus); +} + +void arch_cpu_idle_enter(void) +{ + unsigned int cpu = smp_processor_id(); + + if (!amu_fie_cpu_supported(cpu)) + return; + + /* Kick in AMU update but only if one has not happened already */ + if (housekeeping_cpu(cpu, HK_TYPE_TICK) && + time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu))) + amu_scale_freq_tick(); +} + +#define AMU_SAMPLE_EXP_MS 20 + +int arch_freq_get_on_cpu(int cpu) +{ + struct amu_cntr_sample *amu_sample; + unsigned int start_cpu = cpu; + unsigned long last_update; + unsigned int freq = 0; + u64 scale; + + if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu)) + return -EOPNOTSUPP; + + while (1) { + + amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu); + + last_update = amu_sample->last_scale_update; + + /* + * For those CPUs that are in full dynticks mode, or those that have + * not seen tick for a while, try an alternative source for the counters + * (and thus freq scale), if available, for given policy: this boils + * down to identifying an active cpu within the same freq domain, if any. + */ + if (!housekeeping_cpu(cpu, HK_TYPE_TICK) || + time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) { + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + int ref_cpu; + + if (!policy) + return -EINVAL; + + if (!cpumask_intersects(policy->related_cpus, + housekeeping_cpumask(HK_TYPE_TICK))) { + cpufreq_cpu_put(policy); + return -EOPNOTSUPP; + } + + for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) { + if (ref_cpu == start_cpu) { + /* Prevent verifying same CPU twice */ + ref_cpu = nr_cpu_ids; + break; + } + if (!idle_cpu(ref_cpu)) + break; + } + + cpufreq_cpu_put(policy); + + if (ref_cpu >= nr_cpu_ids) + /* No alternative to pull info from */ + return -EAGAIN; + + cpu = ref_cpu; + } else { + break; + } + } + /* + * Reversed computation to the one used to determine + * the arch_freq_scale value + * (see amu_scale_freq_tick for details) + */ + scale = arch_scale_freq_capacity(cpu); + freq = scale * arch_scale_freq_ref(cpu); + freq >>= SCHED_CAPACITY_SHIFT; + return freq; +} + static void amu_fie_setup(const struct cpumask *cpus) { int cpu; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 4e26bd356a48..529cff825531 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -172,14 +172,6 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -#ifdef CONFIG_PREEMPT -#define S_PREEMPT " PREEMPT" -#elif defined(CONFIG_PREEMPT_RT) -#define S_PREEMPT " PREEMPT_RT" -#else -#define S_PREEMPT "" -#endif - #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) @@ -187,7 +179,7 @@ static int __die(const char *str, long err, struct pt_regs *regs) static int die_counter; int ret; - pr_emerg("Internal error: %s: %016lx [#%d]" S_PREEMPT S_SMP "\n", + pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ diff --git 
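amu_scale_freq_tick() publishes a frequency scale derived from the delta of the core and constant AMU cycle counters, and the new arch_freq_get_on_cpu() simply reverses that step: multiply the last published scale by the reference (typically maximum) frequency and shift down by SCHED_CAPACITY_SHIFT. A worked example with invented numbers (SCHED_CAPACITY_SHIFT is 10):

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t ref_freq_khz = 2800000;	/* assumed reference (max) frequency */
	uint64_t scale = 512;			/* i.e. running at ~50% of reference */

	/* the back-computation used by arch_freq_get_on_cpu() */
	uint64_t freq_khz = (scale * ref_freq_khz) >> SCHED_CAPACITY_SHIFT;

	printf("estimated frequency: %llu kHz\n",
	       (unsigned long long)freq_khz);	/* 1400000 */
	return 0;
}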
a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e8ed8e5b713b..887ac0b05961 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,7 +18,7 @@ #include <linux/sched.h> #include <linux/signal.h> #include <linux/slab.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <linux/vmalloc.h> #include <vdso/datapage.h> #include <vdso/helpers.h> @@ -57,12 +57,6 @@ static struct vdso_abi_info vdso_info[] __ro_after_init = { #endif /* CONFIG_COMPAT_VDSO */ }; -/* - * The vDSO data page. - */ -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { @@ -104,78 +98,6 @@ static int __init __vdso_init(enum vdso_abi abi) return 0; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static const struct vm_special_mapping vvar_map; - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_map)) - zap_vma_pages(vma); - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). 
- */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static const struct vm_special_mapping vvar_map = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -185,11 +107,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -197,16 +119,14 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, - &vvar_map); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; if (system_supports_bti_kernel()) gp_flags = VM_ARM64_BTI; - vdso_base += VVAR_NR_PAGES * PAGE_SIZE; + vdso_base += VDSO_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 35685c036044..5e27e46aa496 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -7,7 +7,7 @@ # # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 47ad6944f9f0..52314be29191 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -20,11 +20,8 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 25a2cb6317f3..f2dfdc7dc818 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,7 +3,7 @@ # Makefile for vdso32 # -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 732702a187e9..e02b27487ce8 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -12,16 +12,15 @@ #include <asm/page.h> #include <asm/vdso.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm") OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . 
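With the generic vDSO data store, the architecture no longer supplies its own [vvar] fault handler: __setup_additional_pages() reserves one region sized for the shared data pages plus the vDSO text, lets vdso_install_vvar_mapping() handle the data pages, and places the ELF text VDSO_NR_PAGES pages above the base. A rough sketch of that layout arithmetic, with illustrative page counts and addresses:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define VDSO_NR_PAGES	4UL		/* assumed value for this sketch */

int main(void)
{
	unsigned long vdso_text_len = 2 * PAGE_SIZE;	/* example text size */
	unsigned long mapping_len = vdso_text_len + VDSO_NR_PAGES * PAGE_SIZE;
	unsigned long base = 0x7f0000000000UL;		/* as if from get_unmapped_area() */
	unsigned long text_base = base + VDSO_NR_PAGES * PAGE_SIZE;

	printf("total mapping %lu bytes, [vvar] at %#lx, [vdso] at %#lx\n",
	       mapping_len, base, text_base);
	return 0;
}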
= SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 70802e4c91cf..5133dcbfe9f7 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1070,8 +1070,7 @@ static void timer_context_init(struct kvm_vcpu *vcpu, int timerid) else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; - hrtimer_init(&ctxt->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ctxt->hrtimer.function = kvm_hrtimer_expire; + hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); switch (timerid) { case TIMER_PTIMER: @@ -1098,8 +1097,8 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) timer_set_offset(vcpu_ptimer(vcpu), 0); } - hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - timer->bg_timer.function = kvm_bg_timer_expire; + hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); } void kvm_timer_init_vm(struct kvm *kvm) diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 3a96c96816e9..f74a66ce3064 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1090,22 +1090,22 @@ static void compute_s1_overlay_permissions(struct kvm_vcpu *vcpu, break; } - if (pov_perms & ~POE_RXW) + if (pov_perms & ~POE_RWX) pov_perms = POE_NONE; if (wi->poe && wr->pov) { wr->pr &= pov_perms & POE_R; - wr->px &= pov_perms & POE_X; wr->pw &= pov_perms & POE_W; + wr->px &= pov_perms & POE_X; } - if (uov_perms & ~POE_RXW) + if (uov_perms & ~POE_RWX) uov_perms = POE_NONE; if (wi->e0poe && wr->uov) { wr->ur &= uov_perms & POE_R; - wr->ux &= uov_perms & POE_X; wr->uw &= uov_perms & POE_W; + wr->ux &= uov_perms & POE_X; } } diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c index e4a342e903e2..098416d7e5c2 100644 --- a/arch/arm64/kvm/ptdump.c +++ b/arch/arm64/kvm/ptdump.c @@ -52,8 +52,8 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = { .set = "AF", .clear = " ", }, { - .mask = PTE_TABLE_BIT | PTE_VALID, - .val = PTE_VALID, + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, .set = "BLK", .clear = " ", }, diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index a5a5f5b97b17..de9a303b6ad0 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,14 +17,27 @@ * Alignment fixed up by hardware. */ - .p2align 4 - // Alignment is for the loop, but since the prologue (including BTI) - // is also 16 bytes we can keep any padding outside the function SYM_FUNC_START(__arch_clear_user) add x2, x0, x1 + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + +USER(9f, setpt [x0]!, x1!, xzr) +USER(6f, setmt [x0]!, x1!, xzr) +USER(6f, setet [x0]!, x1!, xzr) + mov x0, #0 + ret +.Lno_mops: +#endif + subs x1, x1, #8 b.mi 2f -1: + +1: .p2align 4 USER(9f, sttr xzr, [x0]) add x0, x0, #8 subs x1, x1, #8 @@ -47,6 +60,10 @@ USER(7f, sttrb wzr, [x2, #-1]) ret // Exception fixups +6: b.cs 9f + // Registers are in Option A format + add x0, x0, x1 + b 9f 7: sub x0, x2, #5 // Adjust for faulting on the final byte... 
8: add x0, x0, #4 // ...or the second word of the 4-7 byte case 9: sub x0, x2, x0 diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 34e317907524..400057d607ec 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -52,6 +52,13 @@ stp \reg1, \reg2, [\ptr], \val .endm + .macro cpy1 dst, src, count + .arch_extension mops + USER_CPY(9997f, 0, cpyfprt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 0, cpyfmrt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!) + .endm + end .req x5 srcin .req x15 SYM_FUNC_START(__arch_copy_from_user) @@ -62,6 +69,9 @@ SYM_FUNC_START(__arch_copy_from_user) ret // Exception fixups +9996: b.cs 9997f + // Registers are in Option A format + add dst, dst, count 9997: cmp dst, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S index 488df234c49a..7f2f5a0e2fb9 100644 --- a/arch/arm64/lib/copy_template.S +++ b/arch/arm64/lib/copy_template.S @@ -40,6 +40,16 @@ D_l .req x13 D_h .req x14 mov dst, dstin + +#ifdef CONFIG_AS_HAS_MOPS +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + cpy1 dst, src, count + b .Lexitfunc +.Lno_mops: +#endif + cmp count, #16 /*When memory length is less than 16, the accessed are not aligned.*/ b.lo .Ltiny15 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 802231772608..819f2e3fc7a9 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -51,6 +51,13 @@ user_stp 9997f, \reg1, \reg2, \ptr, \val .endm + .macro cpy1 dst, src, count + .arch_extension mops + USER_CPY(9997f, 1, cpyfpwt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 1, cpyfmwt [\dst]!, [\src]!, \count!) + USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!) 
+ .endm + end .req x5 srcin .req x15 SYM_FUNC_START(__arch_copy_to_user) @@ -61,6 +68,9 @@ SYM_FUNC_START(__arch_copy_to_user) ret // Exception fixups +9996: b.cs 9997f + // Registers are in Option A format + add dst, dst, count 9997: cmp dst, dstin b.ne 9998f // Before being absolutely sure we couldn't copy anything, try harder diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 228d681a8715..6e0528831cd3 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -8,8 +8,33 @@ #include <linux/uaccess.h> #include <asm/asm-extable.h> +#include <asm/esr.h> #include <asm/ptrace.h> +static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex, + unsigned long esr) +{ + bool uaccess_is_write = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data); + bool fault_on_write = esr & ESR_ELx_WNR; + + return uaccess_is_write == fault_on_write; +} + +bool insn_may_access_user(unsigned long addr, unsigned long esr) +{ + const struct exception_table_entry *ex = search_exception_tables(addr); + + if (!ex) + return false; + + switch (ex->type) { + case EX_TYPE_UACCESS_CPY: + return cpy_faulted_on_uaccess(ex, esr); + default: + return true; + } +} + static inline unsigned long get_ex_fixup(const struct exception_table_entry *ex) { @@ -29,6 +54,17 @@ static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, return true; } +static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex, + struct pt_regs *regs, unsigned long esr) +{ + /* Do not fix up faults on kernel memory accesses */ + if (!cpy_faulted_on_uaccess(ex, esr)) + return false; + + regs->pc = get_ex_fixup(ex); + return true; +} + static bool ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) @@ -56,7 +92,7 @@ ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex, return true; } -bool fixup_exception(struct pt_regs *regs) +bool fixup_exception(struct pt_regs *regs, unsigned long esr) { const struct exception_table_entry *ex; @@ -70,6 +106,8 @@ bool fixup_exception(struct pt_regs *regs) case EX_TYPE_UACCESS_ERR_ZERO: case EX_TYPE_KACCESS_ERR_ZERO: return ex_handler_uaccess_err_zero(ex, regs); + case EX_TYPE_UACCESS_CPY: + return ex_handler_uaccess_cpy(ex, regs, esr); case EX_TYPE_LOAD_UNALIGNED_ZEROPAD: return ex_handler_load_unaligned_zeropad(ex, regs); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index ef63651099a9..ec0a337891dd 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -375,7 +375,7 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. 
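For MOPS-based user copies a single instruction touches both a user and a kernel buffer, so the new EX_TYPE_UACCESS_CPY extable entry records which side of the copy is the user access; cpy_faulted_on_uaccess() then compares that against ESR_ELx_WNR (the write-not-read bit of the data-abort syndrome) to decide whether the fault hit user memory and may be fixed up. A minimal sketch of the predicate:

#include <stdbool.h>
#include <stdio.h>

#define ESR_WNR		(1u << 6)	/* WnR: write-not-read, bit 6 of the data abort ISS */

static bool cpy_faulted_on_uaccess(bool uaccess_is_write, unsigned int esr)
{
	bool fault_on_write = esr & ESR_WNR;

	/* fix up only when the fault direction matches the user-side access */
	return uaccess_is_write == fault_on_write;
}

int main(void)
{
	/* copy_to_user: the destination (a write) is the user side */
	printf("%d\n", cpy_faulted_on_uaccess(true, ESR_WNR));	/* 1: fix up */
	/* same routine faulting on the kernel-side read: treat as kernel fault */
	printf("%d\n", cpy_faulted_on_uaccess(true, 0));	/* 0 */
	return 0;
}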
*/ - if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) + if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr)) return; if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), @@ -606,7 +606,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, die_kernel_fault("execution of user memory", addr, esr, regs); - if (!search_exception_tables(regs->pc)) + if (!insn_may_access_user(regs->pc, esr)) die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs); } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index b3a7fafe8892..cfe8cb8ba1cc 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -334,7 +334,9 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) switch (hp_size) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - return PGDIR_SIZE - PUD_SIZE; + if (pud_sect_supported()) + return PGDIR_SIZE - PUD_SIZE; + break; #endif case CONT_PMD_SIZE: return PUD_SIZE - CONT_PMD_SIZE; @@ -356,23 +358,21 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) switch (pagesize) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - entry = pud_pte(pud_mkhuge(pte_pud(entry))); + if (pud_sect_supported()) + return pud_pte(pud_mkhuge(pte_pud(entry))); break; #endif case CONT_PMD_SIZE: - entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); - fallthrough; + return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry)))); case PMD_SIZE: - entry = pmd_pte(pmd_mkhuge(pte_pmd(entry))); - break; + return pmd_pte(pmd_mkhuge(pte_pmd(entry))); case CONT_PTE_SIZE: - entry = pte_mkcont(entry); - break; + return pte_mkcont(entry); default: - pr_warn("%s: unrecognized huge page size 0x%lx\n", - __func__, pagesize); break; } + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); return entry; } diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index b65a29440a0c..d541ce45daeb 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -190,7 +190,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, */ static bool __init root_level_aligned(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 1) * PTDESC_TABLE_SHIFT; return (addr % (PAGE_SIZE << shift)) == 0; } @@ -245,7 +245,7 @@ static int __init root_level_idx(u64 addr) */ u64 vabits = IS_ENABLED(CONFIG_ARM64_64K_PAGES) ? 
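The kasan_init.c hunks replace the open-coded (PAGE_SHIFT - 3) with PTDESC_TABLE_SHIFT, the number of VA bits one table level resolves (a 4 KiB page holds 512 eight-byte descriptors, i.e. 9 bits). A worked example of the root-level arithmetic for 4 KiB pages and 48-bit VAs; the constants are assumptions of this sketch:

#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1ULL << PAGE_SHIFT)
#define PTDESC_TABLE_SHIFT	(PAGE_SHIFT - 3)	/* 9: 512 descriptors per table */

/* number of translation levels needed for va_bits of virtual address */
static int pgtable_levels(int va_bits)
{
	return (va_bits - PAGE_SHIFT + PTDESC_TABLE_SHIFT - 1) / PTDESC_TABLE_SHIFT;
}

int main(void)
{
	int va_bits = 48;
	int levels = pgtable_levels(va_bits);		/* 4 */
	int shift = (levels - 1) * PTDESC_TABLE_SHIFT;	/* 27 */

	printf("%d levels, root-level entry covers %llu GiB\n",
	       levels, (unsigned long long)((PAGE_SIZE << shift) >> 30));	/* 512 */
	return 0;
}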
VA_BITS : vabits_actual; - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits) - 1) * PTDESC_TABLE_SHIFT; return (addr & ~_PAGE_OFFSET(vabits)) >> (shift + PAGE_SHIFT); } @@ -269,7 +269,7 @@ static void __init clone_next_level(u64 addr, pgd_t *tmp_pg_dir, pud_t *pud) */ static int __init next_level_idx(u64 addr) { - int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * (PAGE_SHIFT - 3); + int shift = (ARM64_HW_PGTABLE_LEVELS(vabits_actual) - 2) * PTDESC_TABLE_SHIFT; return (addr >> (shift + PAGE_SHIFT)) % PTRS_PER_PTE; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1dfe1a8efdbe..b98f89420713 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1558,9 +1558,8 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) #ifdef CONFIG_ARCH_HAS_PKEYS int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - u64 new_por = POE_RXW; + u64 new_por; u64 old_por; - u64 pkey_shift; if (!system_supports_poe()) return -ENOSPC; @@ -1574,7 +1573,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i return -EINVAL; /* Set the bits we need in POR: */ - new_por = POE_RXW; + new_por = POE_RWX; if (init_val & PKEY_DISABLE_WRITE) new_por &= ~POE_W; if (init_val & PKEY_DISABLE_ACCESS) @@ -1585,12 +1584,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i new_por &= ~POE_X; /* Shift the bits in to the correct place in POR for pkey: */ - pkey_shift = pkey * POR_BITS_PER_PKEY; - new_por <<= pkey_shift; + new_por = POR_ELx_PERM_PREP(pkey, new_por); /* Get old POR and mask off any old bits in place: */ old_por = read_sysreg_s(SYS_POR_EL0); - old_por &= ~(POE_MASK << pkey_shift); + old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); /* Write old part along with new part: */ write_sysreg_s(old_por | new_por, SYS_POR_EL0); diff --git a/arch/arm64/mm/physaddr.c b/arch/arm64/mm/physaddr.c index cde44c13dda1..7d94e09b01b3 100644 --- a/arch/arm64/mm/physaddr.c +++ b/arch/arm64/mm/physaddr.c @@ -10,7 +10,7 @@ phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), - "virt_to_phys used for non-linear address: %pK (%pS)\n", + "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x); diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 688fbe0271ca..8cec0da4cff2 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -80,8 +80,8 @@ static const struct ptdump_prot_bits pte_bits[] = { .set = "CON", .clear = " ", }, { - .mask = PTE_TABLE_BIT | PTE_VALID, - .val = PTE_VALID, + .mask = PMD_TYPE_MASK, + .val = PMD_TYPE_SECT, .set = "BLK", .clear = " ", }, { diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk index 1a2afc9fdd42..f2a1732cb1f6 100755 --- a/arch/arm64/tools/gen-sysreg.awk +++ b/arch/arm64/tools/gen-sysreg.awk @@ -111,7 +111,7 @@ END { /^$/ { next } /^[\t ]*#/ { next } -/^SysregFields/ && block_current() == "Root" { +$1 == "SysregFields" && block_current() == "Root" { block_push("SysregFields") expect_fields(2) @@ -127,7 +127,8 @@ END { next } -/^EndSysregFields/ && block_current() == "SysregFields" { +$1 == "EndSysregFields" && block_current() == "SysregFields" { + expect_fields(1) if (next_bit > 0) fatal("Unspecified bits in " reg) @@ -145,7 +146,7 @@ END { next } -/^Sysreg/ && block_current() == "Root" { +$1 == "Sysreg" && block_current() == "Root" { block_push("Sysreg") expect_fields(7) @@ -177,7 +178,8 @@ END { next } -/^EndSysreg/ 
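Both the signal.c hunk and arch_set_user_pkey_access() now go through POR_ELx_PERM_PREP()/POR_ELx_PERM_SHIFT() rather than open-coding pkey * POR_BITS_PER_PKEY, and the all-permissions constant is spelled POE_RWX. A sketch of the 4-bit-per-key packing; the overlay encodings and the key count are assumptions of this sketch, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

#define POR_BITS_PER_PKEY	4
#define POE_NONE		0x0ULL
#define POE_R			0x1ULL
#define POE_X			0x2ULL
#define POE_W			0x4ULL
#define POE_RWX			(POE_R | POE_W | POE_X)
#define POE_MASK		0xfULL

#define POR_PERM_SHIFT(pkey)		((pkey) * POR_BITS_PER_PKEY)
#define POR_PERM_PREP(pkey, perm)	(((perm) & POE_MASK) << POR_PERM_SHIFT(pkey))

int main(void)
{
	uint64_t por = 0;

	/* grant RWX to every key, as the signal-entry reset does (8 keys assumed) */
	for (int pkey = 0; pkey < 8; pkey++)
		por |= POR_PERM_PREP(pkey, POE_RWX);

	/* make key 3 read-only: clear its field, then set the new value */
	por &= ~(POE_MASK << POR_PERM_SHIFT(3));
	por |= POR_PERM_PREP(3, POE_R);

	printf("POR_EL0 = %#018llx\n", (unsigned long long)por);	/* ...77771777 */
	return 0;
}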
&& block_current() == "Sysreg" { +$1 == "EndSysreg" && block_current() == "Sysreg" { + expect_fields(1) if (next_bit > 0) fatal("Unspecified bits in " reg) @@ -206,7 +208,7 @@ END { # Currently this is effectivey a comment, in future we may want to emit # defines for the fields. -(/^Fields/ || /^Mapping/) && block_current() == "Sysreg" { +($1 == "Fields" || $1 == "Mapping") && block_current() == "Sysreg" { expect_fields(2) if (next_bit != 63) @@ -224,7 +226,7 @@ END { } -/^Res0/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res0" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "RES0", $2) field = "RES0_" msb "_" lsb @@ -234,7 +236,7 @@ END { next } -/^Res1/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Res1" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "RES1", $2) field = "RES1_" msb "_" lsb @@ -244,7 +246,7 @@ END { next } -/^Unkn/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Unkn" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, "UNKN", $2) field = "UNKN_" msb "_" lsb @@ -254,7 +256,7 @@ END { next } -/^Field/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Field" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(3) field = $3 parse_bitdef(reg, field, $2) @@ -265,14 +267,14 @@ END { next } -/^Raz/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Raz" && (block_current() == "Sysreg" || block_current() == "SysregFields") { expect_fields(2) parse_bitdef(reg, field, $2) next } -/^SignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "SignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -285,7 +287,7 @@ END { next } -/^UnsignedEnum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "UnsignedEnum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -298,7 +300,7 @@ END { next } -/^Enum/ && (block_current() == "Sysreg" || block_current() == "SysregFields") { +$1 == "Enum" && (block_current() == "Sysreg" || block_current() == "SysregFields") { block_push("Enum") expect_fields(3) @@ -310,7 +312,8 @@ END { next } -/^EndEnum/ && block_current() == "Enum" { +$1 == "EndEnum" && block_current() == "Enum" { + expect_fields(1) field = null msb = null diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 69a829912a05..0765b3a8d6d6 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -478,3 +478,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 2c63662c1a48..f9476848a2ed 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1664,6 +1664,7 @@ EndEnum UnsignedEnum 59:56 FGT 0b0000 NI 0b0001 IMP + 0b0010 FGT2 EndEnum Res0 55:48 UnsignedEnum 47:44 EXS @@ -1725,6 +1726,7 @@ Enum 3:0 PARANGE 0b0100 44 0b0101 48 0b0110 52 + 0b0111 56 EndEnum EndSysreg @@ -2074,7 +2076,7 @@ EndEnum Res0 4:2 Field 1 ExTRE Field 0 E0TRE -EndSysregFields +EndSysreg Sysreg SMPRI_EL1 3 0 1 2 4 Res0 63:4 
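The gen-sysreg.awk change swaps anchored regexes for exact first-field comparisons: /^Sysreg/ also matches a line starting with "SysregFields", so the old script only worked because the more specific rule happened to come first, whereas $1 == "Sysreg" matches the whole keyword. The same distinction expressed in C, with a hypothetical token:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *first_field = "SysregFields";

	printf("prefix match: %d\n", strncmp(first_field, "Sysreg", 6) == 0);	/* 1 */
	printf("exact match:  %d\n", strcmp(first_field, "Sysreg") == 0);	/* 0 */
	return 0;
}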
@@ -2641,6 +2643,101 @@ Field 0 E0HTRE EndSysreg +Sysreg HDFGRTR2_EL2 3 4 3 1 0 +Res0 63:25 +Field 24 nPMBMAR_EL1 +Field 23 nMDSTEPOP_EL1 +Field 22 nTRBMPAM_EL1 +Res0 21 +Field 20 nTRCITECR_EL1 +Field 19 nPMSDSFR_EL1 +Field 18 nSPMDEVAFF_EL1 +Field 17 nSPMID +Field 16 nSPMSCR_EL1 +Field 15 nSPMACCESSR_EL1 +Field 14 nSPMCR_EL0 +Field 13 nSPMOVS +Field 12 nSPMINTEN +Field 11 nSPMCNTEN +Field 10 nSPMSELR_EL0 +Field 9 nSPMEVTYPERn_EL0 +Field 8 nSPMEVCNTRn_EL0 +Field 7 nPMSSCR_EL1 +Field 6 nPMSSDATA +Field 5 nMDSELR_EL1 +Field 4 nPMUACR_EL1 +Field 3 nPMICFILTR_EL0 +Field 2 nPMICNTR_EL0 +Field 1 nPMIAR_EL1 +Field 0 nPMECR_EL1 +EndSysreg + +Sysreg HDFGWTR2_EL2 3 4 3 1 1 +Res0 63:25 +Field 24 nPMBMAR_EL1 +Field 23 nMDSTEPOP_EL1 +Field 22 nTRBMPAM_EL1 +Field 21 nPMZR_EL0 +Field 20 nTRCITECR_EL1 +Field 19 nPMSDSFR_EL1 +Res0 18:17 +Field 16 nSPMSCR_EL1 +Field 15 nSPMACCESSR_EL1 +Field 14 nSPMCR_EL0 +Field 13 nSPMOVS +Field 12 nSPMINTEN +Field 11 nSPMCNTEN +Field 10 nSPMSELR_EL0 +Field 9 nSPMEVTYPERn_EL0 +Field 8 nSPMEVCNTRn_EL0 +Field 7 nPMSSCR_EL1 +Res0 6 +Field 5 nMDSELR_EL1 +Field 4 nPMUACR_EL1 +Field 3 nPMICFILTR_EL0 +Field 2 nPMICNTR_EL0 +Field 1 nPMIAR_EL1 +Field 0 nPMECR_EL1 +EndSysreg + +Sysreg HFGRTR2_EL2 3 4 3 1 2 +Res0 63:15 +Field 14 nACTLRALIAS_EL1 +Field 13 nACTLRMASK_EL1 +Field 12 nTCR2ALIAS_EL1 +Field 11 nTCRALIAS_EL1 +Field 10 nSCTLRALIAS2_EL1 +Field 9 nSCTLRALIAS_EL1 +Field 8 nCPACRALIAS_EL1 +Field 7 nTCR2MASK_EL1 +Field 6 nTCRMASK_EL1 +Field 5 nSCTLR2MASK_EL1 +Field 4 nSCTLRMASK_EL1 +Field 3 nCPACRMASK_EL1 +Field 2 nRCWSMASK_EL1 +Field 1 nERXGSR_EL1 +Field 0 nPFAR_EL1 +EndSysreg + +Sysreg HFGWTR2_EL2 3 4 3 1 3 +Res0 63:15 +Field 14 nACTLRALIAS_EL1 +Field 13 nACTLRMASK_EL1 +Field 12 nTCR2ALIAS_EL1 +Field 11 nTCRALIAS_EL1 +Field 10 nSCTLRALIAS2_EL1 +Field 9 nSCTLRALIAS_EL1 +Field 8 nCPACRALIAS_EL1 +Field 7 nTCR2MASK_EL1 +Field 6 nTCRMASK_EL1 +Field 5 nSCTLR2MASK_EL1 +Field 4 nSCTLRMASK_EL1 +Field 3 nCPACRMASK_EL1 +Field 2 nRCWSMASK_EL1 +Res0 1 +Field 0 nPFAR_EL1 +EndSysreg + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 @@ -2813,6 +2910,12 @@ Field 1 AMEVCNTR00_EL0 Field 0 AMCNTEN0 EndSysreg +Sysreg HFGITR2_EL2 3 4 3 1 7 +Res0 63:2 +Field 1 nDCCIVAPS +Field 0 TSBCSYNC +EndSysreg + Sysreg ZCR_EL2 3 4 1 2 0 Fields ZCR_ELx EndSysreg diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile index 069ef0b17fe5..a3042287a070 100644 --- a/arch/csky/kernel/vdso/Makefile +++ b/arch/csky/kernel/vdso/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only # Include the generic Makefile to check the built vdso. 
-include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Symbols present in the vdso vdso-syms += rt_sigreturn diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 2b8bd27a852f..687502917ae2 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -30,6 +30,7 @@ config LOONGARCH select ARCH_HAS_SET_MEMORY select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_ARCH_DATA select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION @@ -106,6 +107,7 @@ config LOONGARCH select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GPIOLIB select HAS_IOPORT @@ -291,6 +293,9 @@ config AS_HAS_LBT_EXTENSION config AS_HAS_LVZ_EXTENSION def_bool $(as-instr,hvcl 0) +config CC_HAS_ANNOTATE_TABLEJUMP + def_bool $(cc-option,-mannotate-tablejump) + menu "Kernel type and options" source "kernel/Kconfig.hz" diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 567bd122a9ee..0304eabbe606 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -101,7 +101,11 @@ KBUILD_AFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma) KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)-mthin-add-sub) ifdef CONFIG_OBJTOOL -KBUILD_CFLAGS += -fno-jump-tables +ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP +KBUILD_CFLAGS += -mannotate-tablejump +else +KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers +endif endif KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 73c77500ac46..3c240afe5aed 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -981,7 +981,6 @@ CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m CONFIG_PSTORE=m CONFIG_PSTORE_COMPRESS=y -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_EROFS_FS_ZIP_LZMA=y diff --git a/arch/loongarch/include/asm/vdso.h b/arch/loongarch/include/asm/vdso.h index d3ba35eb23e7..f72ec79e2dde 100644 --- a/arch/loongarch/include/asm/vdso.h +++ b/arch/loongarch/include/asm/vdso.h @@ -31,7 +31,6 @@ struct loongarch_vdso_info { unsigned long size; unsigned long offset_sigreturn; struct vm_special_mapping code_mapping; - struct vm_special_mapping data_mapping; }; extern struct loongarch_vdso_info vdso_info; diff --git a/arch/loongarch/include/asm/vdso/arch_data.h b/arch/loongarch/include/asm/vdso/arch_data.h new file mode 100644 index 000000000000..322d0a5f1c84 --- /dev/null +++ b/arch/loongarch/include/asm/vdso/arch_data.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Author: Huacai Chen <chenhuacai@loongson.cn> + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ + +#ifndef _VDSO_ARCH_DATA_H +#define _VDSO_ARCH_DATA_H + +#ifndef __ASSEMBLY__ + +#include <asm/asm.h> +#include <asm/vdso.h> + +struct vdso_pcpu_data { + u32 node; +} ____cacheline_aligned_in_smp; + +struct vdso_arch_data { + struct vdso_pcpu_data pdata[NR_CPUS]; +}; + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h index e80f3c4ac748..48c43f55b039 100644 --- a/arch/loongarch/include/asm/vdso/getrandom.h +++ 
b/arch/loongarch/include/asm/vdso/getrandom.h @@ -28,11 +28,6 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns return ret; } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - return &_loongarch_data.rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index 7eb3f041af76..88cfcf133116 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -72,7 +72,7 @@ static __always_inline int clock_getres_fallback( } static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { uint64_t count; @@ -89,18 +89,6 @@ static inline bool loongarch_vdso_hres_capable(void) } #define __arch_vdso_hres_capable loongarch_vdso_hres_capable -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h index 1c183a9b2115..50c65fb29daf 100644 --- a/arch/loongarch/include/asm/vdso/vdso.h +++ b/arch/loongarch/include/asm/vdso/vdso.h @@ -12,43 +12,9 @@ #include <asm/asm.h> #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> -struct vdso_pcpu_data { - u32 node; -} ____cacheline_aligned_in_smp; - -struct loongarch_vdso_data { - struct vdso_pcpu_data pdata[NR_CPUS]; - struct vdso_rng_data rng_data; -}; - -/* - * The layout of vvar: - * - * high - * +---------------------+--------------------------+ - * | loongarch vdso data | LOONGARCH_VDSO_DATA_SIZE | - * +---------------------+--------------------------+ - * | time-ns vdso data | PAGE_SIZE | - * +---------------------+--------------------------+ - * | generic vdso data | PAGE_SIZE | - * +---------------------+--------------------------+ - * low - */ -#define LOONGARCH_VDSO_DATA_SIZE PAGE_ALIGN(sizeof(struct loongarch_vdso_data)) -#define LOONGARCH_VDSO_DATA_PAGES (LOONGARCH_VDSO_DATA_SIZE >> PAGE_SHIFT) - -enum vvar_pages { - VVAR_GENERIC_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_LOONGARCH_PAGES_START, - VVAR_LOONGARCH_PAGES_END = VVAR_LOONGARCH_PAGES_START + LOONGARCH_VDSO_DATA_PAGES - 1, - VVAR_NR_PAGES, -}; - -#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) - -extern struct loongarch_vdso_data _loongarch_data __attribute__((visibility("hidden"))); +#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) #endif /* __ASSEMBLY__ */ diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h index 8987e951d0a9..1140b54b4bc8 100644 --- a/arch/loongarch/include/asm/vdso/vsyscall.h +++ b/arch/loongarch/include/asm/vdso/vsyscall.h @@ -6,23 +6,6 @@ #include <vdso/datapage.h> -extern struct vdso_data *vdso_data; -extern struct vdso_rng_data *vdso_rng_data; - -static __always_inline -struct vdso_data *__loongarch_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __loongarch_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__loongarch_get_k_vdso_rng_data(void) -{ - return vdso_rng_data; -} -#define __arch_get_k_vdso_rng_data __loongarch_get_k_vdso_rng_data - /* The asm-generic header needs 
to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index c19f446ad0db..db1e4bb26b6a 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -316,6 +316,6 @@ static void __used output_vdso_defines(void) { COMMENT("LoongArch vDSO offsets."); - DEFINE(__VVAR_PAGES, VVAR_NR_PAGES); + DEFINE(__VDSO_PAGES, VDSO_NR_PAGES); BLANK(); } diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 05e5fbac102a..10cf1608c7b3 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -14,7 +14,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <asm/page.h> #include <asm/vdso.h> @@ -25,18 +25,6 @@ extern char vdso_start[], vdso_end[]; -/* Kernel-provided data used by the VDSO. */ -static union vdso_data_store generic_vdso_data __page_aligned_data; - -static union { - u8 page[LOONGARCH_VDSO_DATA_SIZE]; - struct loongarch_vdso_data vdata; -} loongarch_vdso_data __page_aligned_data; - -struct vdso_data *vdso_data = generic_vdso_data.data; -struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata; -struct vdso_rng_data *vdso_rng_data = &loongarch_vdso_data.vdata.rng_data; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)(new_vma->vm_start); @@ -44,53 +32,12 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc return 0; } -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - unsigned long pfn; - struct page *timens_page = find_timens_vvar_page(vma); - - switch (vmf->pgoff) { - case VVAR_GENERIC_PAGE_OFFSET: - if (!timens_page) - pfn = sym_to_pfn(vdso_data); - else - pfn = page_to_pfn(timens_page); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace specific - * VVAR is mapped with the VVAR_GENERIC_PAGE_OFFSET and the real - * VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - else - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - case VVAR_LOONGARCH_PAGES_START ... 
VVAR_LOONGARCH_PAGES_END: - pfn = sym_to_pfn(&loongarch_vdso_data) + vmf->pgoff - VVAR_LOONGARCH_PAGES_START; - break; - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - struct loongarch_vdso_info vdso_info = { .vdso = vdso_start, .code_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }, - .data_mapping = { - .name = "[vvar]", - .fault = vvar_fault, - }, .offset_sigreturn = vdso_offset_sigreturn, }; @@ -101,7 +48,7 @@ static int __init init_vdso(void) BUG_ON(!PAGE_ALIGNED(vdso_info.vdso)); for_each_possible_cpu(cpu) - vdso_pdata[cpu].node = cpu_to_node(cpu); + vdso_k_arch_data->pdata[cpu].node = cpu_to_node(cpu); vdso_info.size = PAGE_ALIGN(vdso_end - vdso_start); vdso_info.code_mapping.pages = @@ -115,37 +62,6 @@ static int __init init_vdso(void) } subsys_initcall(init_vdso); -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a - * task changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vdso_info.data_mapping)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - static unsigned long vdso_base(void) { unsigned long base = STACK_TOP; @@ -181,9 +97,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; } - vma = _install_special_mapping(mm, data_addr, VVAR_SIZE, - VM_READ | VM_MAYREAD | VM_PFNMAP, - &info->data_mapping); + vma = vdso_install_vvar_mapping(mm, data_addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 552cde722932..8e427b379661 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1487,8 +1487,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.vpid = 0; vcpu->arch.flush_gpa = INVALID_GPA; - hrtimer_init(&vcpu->arch.swtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - vcpu->arch.swtimer.function = kvm_swtimer_wakeup; + hrtimer_setup(&vcpu->arch.swtimer, kvm_swtimer_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); /* Get GPA (=HVA) of PGD for kvm hypervisor */ vcpu->arch.kvm_pgd = __pa(vcpu->kvm->arch.pgd); diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index fdde1bcd4e26..1c26147aff70 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -2,7 +2,7 @@ # Objects to go into the VDSO. # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o vgetrandom.o \ vgetrandom-chacha.o sigreturn.o diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S index 160cfaef2de4..8ff986499947 100644 --- a/arch/loongarch/vdso/vdso.lds.S +++ b/arch/loongarch/vdso/vdso.lds.S @@ -5,6 +5,7 @@ */ #include <asm/page.h> #include <generated/asm-offsets.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-loongarch", "elf64-loongarch", "elf64-loongarch") @@ -12,11 +13,8 @@ OUTPUT_ARCH(loongarch) SECTIONS { - PROVIDE(_vdso_data = . 
- __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - PROVIDE(_loongarch_data = _vdso_data + 2 * PAGE_SIZE); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c index 0db51258b2a7..5301cd9d0f83 100644 --- a/arch/loongarch/vdso/vgetcpu.c +++ b/arch/loongarch/vdso/vgetcpu.c @@ -19,27 +19,19 @@ static __always_inline int read_cpu_id(void) return cpu_id; } -static __always_inline const struct vdso_pcpu_data *get_pcpu_data(void) -{ - return _loongarch_data.pdata; -} - extern int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused); int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused) { int cpu_id; - const struct vdso_pcpu_data *data; cpu_id = read_cpu_id(); if (cpu) *cpu = cpu_id; - if (node) { - data = get_pcpu_data(); - *node = data[cpu_id].node; - } + if (node) + *node = vdso_u_arch_data.pdata[cpu_id].node; return 0; } diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index dbf2ea561c85..31ecb8b7b9f1 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -299,7 +299,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_PARPORT=m CONFIG_PARPORT_AMIGA=m @@ -336,6 +335,7 @@ CONFIG_ATA=y CONFIG_PATA_GAYLE=y CONFIG_PATA_BUDDHA=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -486,7 +486,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index b0fd199cc0a4..1f57514624d5 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -295,7 +295,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -316,6 +315,7 @@ CONFIG_SCSI_SAS_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -443,7 +443,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index bb5b2d3b6c10..02db7a48e57e 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -302,7 +302,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_PARPORT=m CONFIG_PARPORT_ATARI=m @@ -331,6 +330,7 @@ CONFIG_ATA=y # CONFIG_ATA_BMDMA is not set CONFIG_PATA_FALCON=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -463,7 +463,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 8315a13bab73..f0e673cb17eb 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -292,7 +292,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y 
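After the switch to the generic data store, __vdso_getcpu() reads the node id straight out of the vdso_u_arch_data page, a per-CPU array the kernel fills with cpu_to_node() at boot, instead of going through the LoongArch-private _loongarch_data symbol. A userspace-style sketch of that lookup; the array contents are invented:

#include <stdio.h>

#define NR_CPUS	8

struct vdso_pcpu_data { unsigned int node; };

/* stands in for the kernel-provided vdso_u_arch_data.pdata[] page */
static const struct vdso_pcpu_data pdata[NR_CPUS] = {
	{ 0 }, { 0 }, { 0 }, { 0 }, { 1 }, { 1 }, { 1 }, { 1 },
};

static int vdso_getcpu(unsigned int *cpu, unsigned int *node, unsigned int cpu_id)
{
	if (cpu)
		*cpu = cpu_id;
	if (node)
		*node = pdata[cpu_id].node;	/* no syscall needed */
	return 0;
}

int main(void)
{
	unsigned int cpu, node;

	vdso_getcpu(&cpu, &node, 5);
	printf("cpu %u is on node %u\n", cpu, node);	/* cpu 5 is on node 1 */
	return 0;
}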
CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -314,6 +313,7 @@ CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_BVME6000_SCSI=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -435,7 +435,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 350370657e5f..e8ca5a50b86d 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -294,7 +294,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -315,6 +314,7 @@ CONFIG_SCSI_SAS_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -445,7 +445,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index f942b4755702..b3a270441bb1 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -293,7 +293,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_BLK_DEV_SWIM=m CONFIG_ZRAM=m @@ -320,6 +319,7 @@ CONFIG_ATA=y # CONFIG_ATA_BMDMA is not set CONFIG_PATA_PLATFORM=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -462,7 +462,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index b1eaad02efab..d215dba006ce 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -313,7 +313,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_PARPORT=m CONFIG_PARPORT_PC=m @@ -363,6 +362,7 @@ CONFIG_PATA_GAYLE=y CONFIG_PATA_BUDDHA=y CONFIG_PATA_PLATFORM=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -549,7 +549,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 6309a4442bb3..a888ed93ff82 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -291,7 +291,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -313,6 +312,7 @@ CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_MVME147_SCSI=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -435,7 +435,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 3feb0731f814..b481782375f6 100644 --- a/arch/m68k/configs/mvme16x_defconfig 
+++ b/arch/m68k/configs/mvme16x_defconfig @@ -292,7 +292,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -314,6 +313,7 @@ CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_MVME16x_SCSI=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -436,7 +436,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index ea04b1b0da7d..6eba743d8eb5 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -293,7 +293,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_PARPORT=m CONFIG_PARPORT_PC=m @@ -320,6 +319,7 @@ CONFIG_ATA=y # CONFIG_ATA_BMDMA is not set CONFIG_PATA_FALCON=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -452,7 +452,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index f52d9af92153..9bdbb418ffa8 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -288,7 +288,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -310,6 +309,7 @@ CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_SUN3_SCSI=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -433,7 +433,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index f348447824da..e1cf20fa5343 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -289,7 +289,6 @@ CONFIG_NET_IFE=m CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_TEST_ASYNC_DRIVER_PROBE=m -CONFIG_DM_KUNIT_TEST=m CONFIG_CONNECTOR=m CONFIG_ZRAM=m CONFIG_BLK_DEV_LOOP=y @@ -311,6 +310,7 @@ CONFIG_ISCSI_TCP=m CONFIG_ISCSI_BOOT_SYSFS=m CONFIG_SUN3X_ESP=y CONFIG_MD=y +CONFIG_MD_LINEAR=m CONFIG_BLK_DEV_DM=m CONFIG_DM_UNSTRIPED=m CONFIG_DM_CRYPT=m @@ -433,7 +433,6 @@ CONFIG_OMFS_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_QNX6FS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_EROFS_FS=m CONFIG_NFS_FS=y diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 8f2676c3a988..3c43c09d4489 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -95,10 +95,24 @@ static inline void set_fc(unsigned long val) "movec %0,%/dfc\n\t" : /* no outputs */ : "r" (val) : "memory"); } + +static inline unsigned long get_fc(void) +{ + unsigned long val; + + __asm__ ("movec %/dfc,%0" : "=r" (val) : ); + + return val; +} #else static inline void set_fc(unsigned long val) { } + +static inline unsigned long get_fc(void) +{ + return USER_DATA; +} #endif /* CONFIG_CPU_HAS_ADDRESS_SPACES */ struct thread_struct { diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 15c1a595a1de..3bc28bc512ac 100644 --- 
a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -147,8 +147,7 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record) break; case BI_COMMAND_LINE: - strscpy(m68k_command_line, data, - sizeof(m68k_command_line)); + strscpy(m68k_command_line, data); break; case BI_RNG_SEED: { diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f5ed71f1910d..9fe47112c586 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -466,3 +466,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 119bd32efcfb..b39fc3717d8e 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -17,6 +17,7 @@ #include <linux/bitops.h> #include <linux/module.h> #include <linux/sched/mm.h> +#include <linux/string_choices.h> #include <asm/setup.h> #include <asm/traps.h> @@ -370,8 +371,8 @@ int mmu_emu_handle_fault (unsigned long vaddr, int read_flag, int kernel_fault) } #ifdef DEBUG_MMU_EMU - pr_info("%s: vaddr=%lx type=%s crp=%p\n", __func__, vaddr, - read_flag ? "read" : "write", crp); + pr_info("%s: vaddr=%lx type=%s crp=%px\n", __func__, vaddr, + str_read_write(read_flag), crp); #endif segment = (vaddr >> SUN3_PMEG_SIZE_BITS) & 0x7FF; @@ -417,7 +418,7 @@ int mmu_emu_handle_fault (unsigned long vaddr, int read_flag, int kernel_fault) pte_val (*pte) |= SUN3_PAGE_ACCESSED; #ifdef DEBUG_MMU_EMU - pr_info("seg:%ld crp:%p ->", get_fs().seg, crp); + pr_info("seg:%ld crp:%px ->", get_fc(), crp); print_pte_vaddr (vaddr); pr_cont("\n"); #endif diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 680f568b77f2..7b6e97828e55 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -472,3 +472,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 1924f2d83932..b51168e319ea 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -51,6 +51,7 @@ config MIPS select GENERIC_SMP_IDLE_THREAD select GENERIC_IDLE_POLL_SETUP select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAS_IOPORT if !NO_IOPORT_MAP || ISA select HAVE_ARCH_COMPILER_H diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 4390d30206d9..e308b82c094a 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ -347,7 +347,6 @@ CONFIG_CRAMFS=m CONFIG_VXFS_FS=m CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig index d63d8be8cb50..fa5b04063ddb 100644 --- a/arch/mips/configs/malta_kvm_defconfig +++ b/arch/mips/configs/malta_kvm_defconfig @@ -354,7 +354,6 @@ CONFIG_CRAMFS=m CONFIG_VXFS_FS=m CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig index 338bb6544a93..40283171af68 100644 --- a/arch/mips/configs/maltaup_xpa_defconfig +++ 
b/arch/mips/configs/maltaup_xpa_defconfig @@ -353,7 +353,6 @@ CONFIG_CRAMFS=m CONFIG_VXFS_FS=m CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig index 08e1c1f2f4de..3a6d0384b774 100644 --- a/arch/mips/configs/rm200_defconfig +++ b/arch/mips/configs/rm200_defconfig @@ -336,7 +336,6 @@ CONFIG_MINIX_FS=m CONFIG_HPFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_ROMFS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_NFS_FS=m CONFIG_NFSD=m diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index 44a45f3fa4b0..fd32baa30e17 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -167,7 +167,7 @@ static __always_inline u64 read_r4k_count(void) #ifdef CONFIG_CLKSRC_MIPS_GIC -static __always_inline u64 read_gic_count(const struct vdso_data *data) +static __always_inline u64 read_gic_count(const struct vdso_time_data *data) { void __iomem *gic = get_gic(data); u32 hi, hi2, lo; @@ -184,7 +184,7 @@ static __always_inline u64 read_gic_count(const struct vdso_data *data) #endif static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { #ifdef CONFIG_CSRC_R4K if (clock_mode == VDSO_CLOCKMODE_R4K) @@ -209,10 +209,11 @@ static inline bool mips_vdso_hres_capable(void) } #define __arch_vdso_hres_capable mips_vdso_hres_capable -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return get_vdso_data(); + return get_vdso_time_data(); } +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data #endif /* !__ASSEMBLY__ */ diff --git a/arch/mips/include/asm/vdso/vdso.h b/arch/mips/include/asm/vdso/vdso.h index 6cd88191fefa..acd0efcd3d93 100644 --- a/arch/mips/include/asm/vdso/vdso.h +++ b/arch/mips/include/asm/vdso/vdso.h @@ -5,16 +5,18 @@ */ #include <asm/sgidefs.h> +#include <vdso/page.h> + +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ #include <asm/asm.h> -#include <asm/page.h> #include <asm/vdso.h> -static inline unsigned long get_vdso_base(void) +static inline const struct vdso_time_data *get_vdso_time_data(void) { - unsigned long addr; + const struct vdso_time_data *addr; /* * We can't use cpu_has_mips_r6 since it needs the cpu_data[] @@ -27,7 +29,7 @@ static inline unsigned long get_vdso_base(void) * We can't use addiupc because there is no label-label * support for the addiupc reloc */ - __asm__("lapc %0, _start \n" + __asm__("lapc %0, vdso_u_time_data \n" : "=r" (addr) : :); #else /* @@ -46,7 +48,7 @@ static inline unsigned long get_vdso_base(void) " .set noreorder \n" " bal 1f \n" " nop \n" - " .word _start - . \n" + " .word vdso_u_time_data - . 
\n" "1: lw %0, 0($31) \n" " " STR(PTR_ADDU) " %0, $31, %0 \n" " .set pop \n" @@ -58,14 +60,9 @@ static inline unsigned long get_vdso_base(void) return addr; } -static inline const struct vdso_data *get_vdso_data(void) -{ - return (const struct vdso_data *)(get_vdso_base() - PAGE_SIZE); -} - #ifdef CONFIG_CLKSRC_MIPS_GIC -static inline void __iomem *get_gic(const struct vdso_data *data) +static inline void __iomem *get_gic(const struct vdso_time_data *data) { return (void __iomem *)((unsigned long)data & PAGE_MASK) - PAGE_SIZE; } diff --git a/arch/mips/include/asm/vdso/vsyscall.h b/arch/mips/include/asm/vdso/vsyscall.h index a4582870aaea..2b1debb62dee 100644 --- a/arch/mips/include/asm/vdso/vsyscall.h +++ b/arch/mips/include/asm/vdso/vsyscall.h @@ -2,22 +2,12 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H +#include <asm/page.h> + #ifndef __ASSEMBLY__ #include <vdso/datapage.h> -extern struct vdso_data *vdso_data; - -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ -static __always_inline -struct vdso_data *__mips_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __mips_get_k_vdso_data - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 61503a36067e..f7107479c7fa 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -1326,24 +1326,8 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs) return -1; } -#ifdef CONFIG_SECCOMP - if (unlikely(test_thread_flag(TIF_SECCOMP))) { - int ret, i; - struct seccomp_data sd; - unsigned long args[6]; - - sd.nr = current_thread_info()->syscall; - sd.arch = syscall_get_arch(current); - syscall_get_arguments(current, regs, args); - for (i = 0; i < 6; i++) - sd.args[i] = args[i]; - sd.instruction_pointer = KSTK_EIP(current); - - ret = __secure_computing(&sd); - if (ret == -1) - return ret; - } -#endif + if (secure_computing()) + return -1; if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 0b9b7e25b69a..aa70e371bb54 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -405,3 +405,4 @@ 464 n32 getxattrat sys_getxattrat 465 n32 listxattrat sys_listxattrat 466 n32 removexattrat sys_removexattrat +467 n32 open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c844cd5cda62..1e8c44c7b614 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -381,3 +381,4 @@ 464 n64 getxattrat sys_getxattrat 465 n64 listxattrat sys_listxattrat 466 n64 removexattrat sys_removexattrat +467 n64 open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 349b8aad1159..114a5a1a6230 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -454,3 +454,4 @@ 464 o32 getxattrat sys_getxattrat 465 o32 listxattrat sys_listxattrat 466 o32 removexattrat sys_removexattrat +467 o32 open_tree_attr sys_open_tree_attr diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 75c9d3618f58..de096777172f 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -15,6 +15,7 @@ #include 
<linux/random.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/vdso_datastore.h> #include <asm/abi.h> #include <asm/mips-cps.h> @@ -23,20 +24,7 @@ #include <vdso/helpers.h> #include <vdso/vsyscall.h> -/* Kernel-provided data used by the VDSO. */ -static union vdso_data_store mips_vdso_data __page_aligned_data; -struct vdso_data *vdso_data = mips_vdso_data.data; - -/* - * Mapping for the VDSO data/GIC pages. The real pages are mapped manually, as - * what we map and where within the area they are mapped is determined at - * runtime. - */ -static struct page *no_pages[] = { NULL }; -static struct vm_special_mapping vdso_vvar_mapping = { - .name = "[vvar]", - .pages = no_pages, -}; +static_assert(VDSO_NR_PAGES == __VDSO_PAGES); static void __init init_vdso_image(struct mips_vdso_image *image) { @@ -90,7 +78,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mips_vdso_image *image = current->thread.abi->vdso; struct mm_struct *mm = current->mm; - unsigned long gic_size, vvar_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base; + unsigned long gic_size, size, base, data_addr, vdso_addr, gic_pfn, gic_base; struct vm_area_struct *vma; int ret; @@ -119,8 +107,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * the counter registers at the start. */ gic_size = mips_gic_present() ? PAGE_SIZE : 0; - vvar_size = gic_size + PAGE_SIZE; - size = vvar_size + image->size; + size = gic_size + VDSO_NR_PAGES * PAGE_SIZE + image->size; /* * Find a region that's large enough for us to perform the @@ -143,15 +130,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) */ if (cpu_has_dc_aliases) { base = __ALIGN_MASK(base, shm_align_mask); - base += ((unsigned long)vdso_data - gic_size) & shm_align_mask; + base += ((unsigned long)vdso_k_time_data - gic_size) & shm_align_mask; } data_addr = base + gic_size; - vdso_addr = data_addr + PAGE_SIZE; + vdso_addr = data_addr + VDSO_NR_PAGES * PAGE_SIZE; - vma = _install_special_mapping(mm, base, vvar_size, - VM_READ | VM_MAYREAD, - &vdso_vvar_mapping); + vma = vdso_install_vvar_mapping(mm, data_addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -161,6 +146,17 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (gic_size) { gic_base = (unsigned long)mips_gic_base + MIPS_GIC_USER_OFS; gic_pfn = PFN_DOWN(__pa(gic_base)); + static const struct vm_special_mapping gic_mapping = { + .name = "[gic]", + .pages = (struct page **) { NULL }, + }; + + vma = _install_special_mapping(mm, base, gic_size, VM_READ | VM_MAYREAD, + &gic_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } ret = io_remap_pfn_range(vma, base, gic_pfn, gic_size, pgprot_noncached(vma->vm_page_prot)); @@ -168,13 +164,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; } - /* Map data page. */ - ret = remap_pfn_range(vma, data_addr, - virt_to_phys(vdso_data) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot); - if (ret) - goto out; - /* Map VDSO image. 
*/ vma = _install_special_mapping(mm, vdso_addr, image->size, VM_READ | VM_EXEC | diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 60b43ea85c12..cef3c423a41a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -288,9 +288,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (err) return err; - hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; + hrtimer_setup(&vcpu->arch.comparecount_timer, kvm_mips_comparecount_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Allocate space for host mode exception handlers that handle diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index b289b2c1b294..fb4c493aaffa 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -2,7 +2,7 @@ # Objects to go into the VDSO. # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso-y := elf.o vgettimeofday.o sigreturn.o diff --git a/arch/mips/vdso/vdso.lds.S b/arch/mips/vdso/vdso.lds.S index 836465e3bcb8..c8bbe56d89cb 100644 --- a/arch/mips/vdso/vdso.lds.S +++ b/arch/mips/vdso/vdso.lds.S @@ -5,6 +5,8 @@ */ #include <asm/sgidefs.h> +#include <asm/vdso/vdso.h> +#include <vdso/datapage.h> #if _MIPS_SIM == _MIPS_SIM_ABI64 OUTPUT_FORMAT("elf64-tradlittlemips", "elf64-tradbigmips", "elf64-tradlittlemips") @@ -18,7 +20,8 @@ OUTPUT_ARCH(mips) SECTIONS { - PROVIDE(_start = .); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; /* diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 19a804860ed5..ea623f910748 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -268,7 +268,6 @@ CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_XATTR=y CONFIG_CONFIGFS_FS=y -CONFIG_SYSV_FS=y CONFIG_NFS_FS=m CONFIG_NFS_V4=m CONFIG_NFS_V4_1=y diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h index 2a2dc11b5545..5f581c1d6460 100644 --- a/arch/parisc/include/asm/vdso.h +++ b/arch/parisc/include/asm/vdso.h @@ -12,8 +12,6 @@ #define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) -extern struct vdso_data *vdso_data; - #endif /* __ASSEMBLY __ */ /* Default link addresses for the vDSOs */ diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index d9fc94c86965..94df3cb957e9 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -465,3 +465,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/parisc/kernel/vdso32/Makefile b/arch/parisc/kernel/vdso32/Makefile index 288f8b85978f..4ee8d17da229 100644 --- a/arch/parisc/kernel/vdso32/Makefile +++ b/arch/parisc/kernel/vdso32/Makefile @@ -1,5 +1,5 @@ # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include KCOV_INSTRUMENT := n diff --git a/arch/parisc/kernel/vdso64/Makefile b/arch/parisc/kernel/vdso64/Makefile index bc5d9553f311..c63f4069170f 100644 --- a/arch/parisc/kernel/vdso64/Makefile +++ b/arch/parisc/kernel/vdso64/Makefile @@ -1,5 +1,5 @@ # Include the generic Makefile to check the built vdso. 
-include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include KCOV_INSTRUMENT := n diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 424f188e62d9..8a2a6e403eb1 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -159,6 +159,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN + select ARCH_HAS_VDSO_ARCH_DATA select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAVE_EXTRA_ELF_NOTES if SPU_BASE select ARCH_KEEP_MEMBLOCK @@ -209,6 +210,7 @@ config PPC select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select HAS_IOPORT if PCI select HAVE_ARCH_AUDITSYSCALL diff --git a/arch/powerpc/configs/fsl-emb-nonhw.config b/arch/powerpc/configs/fsl-emb-nonhw.config index 3009b0efaf34..d6d2a458847b 100644 --- a/arch/powerpc/configs/fsl-emb-nonhw.config +++ b/arch/powerpc/configs/fsl-emb-nonhw.config @@ -112,7 +112,6 @@ CONFIG_QNX4FS_FS=m CONFIG_RCU_TRACE=y CONFIG_RESET_CONTROLLER=y CONFIG_ROOT_NFS=y -CONFIG_SYSV_FS=m CONFIG_SYSVIPC=y CONFIG_TMPFS=y CONFIG_UBIFS_FS=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ca0c90e95837..364d1a78bc12 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -986,7 +986,6 @@ CONFIG_MINIX_FS=m CONFIG_OMFS_FS=m CONFIG_QNX4FS_FS=m CONFIG_ROMFS_FS=m -CONFIG_SYSV_FS=m CONFIG_UFS_FS=m CONFIG_NFS_FS=m CONFIG_NFS_V3_ACL=y diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 16bacfe8c7a2..da15b5efe807 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -152,6 +152,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpu == cpu_first_thread_sibling(cpu); } +#define topology_is_primary_thread topology_is_primary_thread static inline bool topology_smt_thread_allowed(unsigned int cpu) { diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 8d972bc98b55..1ca23fbfe087 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_VDSO_H #define VDSO_VERSION_STRING LINUX_2.6.15 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/vdso/arch_data.h b/arch/powerpc/include/asm/vdso/arch_data.h new file mode 100644 index 000000000000..c240a6b87518 --- /dev/null +++ b/arch/powerpc/include/asm/vdso/arch_data.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2002 Peter Bergner <bergner@vnet.ibm.com>, IBM + * Copyright (C) 2005 Benjamin Herrenschmidt <benh@kernel.crashing.org>, + * IBM Corp. 
+ */ +#ifndef _ASM_POWERPC_VDSO_ARCH_DATA_H +#define _ASM_POWERPC_VDSO_ARCH_DATA_H + +#include <linux/unistd.h> +#include <linux/types.h> + +#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32) + +#ifdef CONFIG_PPC64 + +struct vdso_arch_data { + __u64 tb_ticks_per_sec; /* Timebase tics / sec */ + __u32 dcache_block_size; /* L1 d-cache block size */ + __u32 icache_block_size; /* L1 i-cache block size */ + __u32 dcache_log_block_size; /* L1 d-cache log block size */ + __u32 icache_log_block_size; /* L1 i-cache log block size */ + __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ + __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */ +}; + +#else /* CONFIG_PPC64 */ + +struct vdso_arch_data { + __u64 tb_ticks_per_sec; /* Timebase tics / sec */ + __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ + __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */ +}; + +#endif /* CONFIG_PPC64 */ + +#endif /* _ASM_POWERPC_VDSO_ARCH_DATA_H */ diff --git a/arch/powerpc/include/asm/vdso/getrandom.h b/arch/powerpc/include/asm/vdso/getrandom.h index 80ce0709725e..067a5396aac6 100644 --- a/arch/powerpc/include/asm/vdso/getrandom.h +++ b/arch/powerpc/include/asm/vdso/getrandom.h @@ -43,20 +43,21 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig (unsigned long)len, (unsigned long)flags); } -static __always_inline struct vdso_rng_data *__arch_get_vdso_rng_data(void) +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) { - struct vdso_arch_data *data; + struct vdso_rng_data *data; asm ( " bcl 20, 31, .+4 ;" "0: mflr %0 ;" - " addis %0, %0, (_vdso_datapage - 0b)@ha ;" - " addi %0, %0, (_vdso_datapage - 0b)@l ;" + " addis %0, %0, (vdso_u_rng_data - 0b)@ha ;" + " addi %0, %0, (vdso_u_rng_data - 0b)@l ;" : "=r" (data) : : "lr" ); - return &data->rng_data; + return data; } +#define __arch_get_vdso_u_rng_data __arch_get_vdso_u_rng_data ssize_t __c_kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index c6390890a60c..99c9d6f43fde 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -94,22 +94,12 @@ int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) #endif static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return get_tb(); } -const struct vdso_data *__arch_get_vdso_data(void); - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return (void *)vd + (1U << CONFIG_PAGE_SHIFT); -} -#endif - -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { return true; } @@ -135,21 +125,22 @@ static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift) #ifdef __powerpc64__ int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, - const struct vdso_data *vd); + const struct vdso_time_data *vd); #else int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_gettime64(clockid_t 
clock, struct __kernel_timespec *ts, - const struct vdso_data *vd); + const struct vdso_time_data *vd); int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, - const struct vdso_data *vd); + const struct vdso_time_data *vd); #endif int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, - const struct vdso_data *vd); + const struct vdso_time_data *vd); __kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, - const struct vdso_data *vd); + const struct vdso_time_data *vd); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/powerpc/include/asm/vdso/vsyscall.h b/arch/powerpc/include/asm/vdso/vsyscall.h index 48560a119559..c2c9ae1b22e7 100644 --- a/arch/powerpc/include/asm/vdso/vsyscall.h +++ b/arch/powerpc/include/asm/vdso/vsyscall.h @@ -6,19 +6,6 @@ #include <asm/vdso_datapage.h> -static __always_inline -struct vdso_data *__arch_get_k_vdso_data(void) -{ - return vdso_data->data; -} -#define __arch_get_k_vdso_data __arch_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__arch_get_k_vdso_rng_data(void) -{ - return &vdso_data->rng_data; -} - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h index a202f5b63479..95d45a50355d 100644 --- a/arch/powerpc/include/asm/vdso_datapage.h +++ b/arch/powerpc/include/asm/vdso_datapage.h @@ -11,56 +11,18 @@ #ifndef __ASSEMBLY__ -#include <linux/unistd.h> -#include <linux/time.h> #include <vdso/datapage.h> -#define SYSCALL_MAP_SIZE ((NR_syscalls + 31) / 32) - -#ifdef CONFIG_PPC64 - -struct vdso_arch_data { - __u64 tb_ticks_per_sec; /* Timebase tics / sec */ - __u32 dcache_block_size; /* L1 d-cache block size */ - __u32 icache_block_size; /* L1 i-cache block size */ - __u32 dcache_log_block_size; /* L1 d-cache log block size */ - __u32 icache_log_block_size; /* L1 i-cache log block size */ - __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ - __u32 compat_syscall_map[SYSCALL_MAP_SIZE]; /* Map of compat syscalls */ - - struct vdso_rng_data rng_data; - - struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); -}; - -#else /* CONFIG_PPC64 */ - -struct vdso_arch_data { - __u64 tb_ticks_per_sec; /* Timebase tics / sec */ - __u32 syscall_map[SYSCALL_MAP_SIZE]; /* Map of syscalls */ - __u32 compat_syscall_map[0]; /* No compat syscalls on PPC32 */ - struct vdso_rng_data rng_data; - - struct vdso_data data[CS_BASES] __aligned(1 << CONFIG_PAGE_SHIFT); -}; - -#endif /* CONFIG_PPC64 */ - -extern struct vdso_arch_data *vdso_data; - #else /* __ASSEMBLY__ */ -.macro get_datapage ptr offset=0 +.macro get_datapage ptr symbol bcl 20, 31, .+4 999: mflr \ptr - addis \ptr, \ptr, (_vdso_datapage - 999b + \offset)@ha - addi \ptr, \ptr, (_vdso_datapage - 999b + \offset)@l + addis \ptr, \ptr, (\symbol - 999b)@ha + addi \ptr, \ptr, (\symbol - 999b)@l .endm -#include <asm/asm-offsets.h> -#include <asm/page.h> - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 7a390bd4f4af..b3048f6d3822 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -334,7 +334,6 @@ int main(void) #endif /* ! 
CONFIG_PPC64 */ /* datapage offsets for use by vdso */ - OFFSET(VDSO_DATA_OFFSET, vdso_arch_data, data); OFFSET(CFG_TB_TICKS_PER_SEC, vdso_arch_data, tb_ticks_per_sec); #ifdef CONFIG_PPC64 OFFSET(CFG_ICACHE_BLOCKSZ, vdso_arch_data, icache_block_size); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4b371c738213..d44349fe8e2b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -751,7 +751,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) * prstatus.pr_pid = ???? */ elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); return buf; } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 727ed4a14545..c6997df63287 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -215,7 +215,7 @@ static int do_seccomp(struct pt_regs *regs) * have already loaded -ENOSYS into r3, or seccomp has put * something else in r3 (via SECCOMP_RET_ERRNO/TRACE). */ - if (__secure_computing(NULL)) + if (__secure_computing()) return -1; /* diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index d8b4ab78bef0..9a084bdb8926 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -557,3 +557,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0727332ad86f..15784c5c95c7 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -950,7 +950,7 @@ void __init time_init(void) sys_tz.tz_dsttime = 0; } - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; + vdso_k_arch_data->tb_ticks_per_sec = tb_ticks_per_sec; #ifdef CONFIG_PPC64_PROC_SYSTEMCFG systemcfg->tb_ticks_per_sec = tb_ticks_per_sec; #endif diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index edf5cabe5dfd..cb8e9357383e 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -263,10 +263,9 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", + printk("%s PAGE_SIZE=%luK%s %s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 43379365ce1b..219d67bcf747 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -17,7 +17,7 @@ #include <linux/elf.h> #include <linux/security.h> #include <linux/syscalls.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> #include <asm/syscall.h> @@ -32,6 +32,8 @@ #include <asm/vdso_datapage.h> #include <asm/setup.h> +static_assert(__VDSO_PAGES == VDSO_NR_PAGES); + /* The alignment of the vDSO */ #define VDSO_ALIGNMENT (1 << 16) @@ -40,24 +42,6 @@ extern char vdso64_start, vdso64_end; long sys_ni_syscall(void); -/* - * The vdso data page (aka. systemcfg for old ppc64 fans) is here. 
- * Once the early boot kernel code no longer needs to muck around - * with it, it will become dynamically allocated - */ -static union { - struct vdso_arch_data data; - u8 page[2 * PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_arch_data *vdso_data = &vdso_data_store.data; - -enum vvar_pages { - VVAR_BASE_PAGE_OFFSET, - VVAR_TIME_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma, unsigned long text_size) { @@ -96,14 +80,6 @@ static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struc mm->context.vdso = NULL; } -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf); - -static struct vm_special_mapping vvar_spec __ro_after_init = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -116,73 +92,6 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { .close = vdso_close, }; -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return vvar_page; -} - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. - * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - VMA_ITERATOR(vmi, mm, 0); - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_spec)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_BASE_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - break; - case VVAR_TIME_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = virt_to_pfn(vdso_data->data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data->data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree @@ -191,7 +100,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int { unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; - unsigned long vvar_size = VVAR_NR_PAGES * PAGE_SIZE; + unsigned long vvar_size = VDSO_NR_PAGES * PAGE_SIZE; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -217,9 +126,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int /* Add required alignment. 
*/ vdso_base = ALIGN(vdso_base, VDSO_ALIGNMENT); - vma = _install_special_mapping(mm, vdso_base, vvar_size, - VM_READ | VM_MAYREAD | VM_IO | - VM_DONTDUMP | VM_PFNMAP, &vvar_spec); + vma = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(vma)) return PTR_ERR(vma); @@ -299,10 +206,10 @@ static void __init vdso_setup_syscall_map(void) for (i = 0; i < NR_syscalls; i++) { if (sys_call_table[i] != (void *)&sys_ni_syscall) - vdso_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); + vdso_k_arch_data->syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); if (IS_ENABLED(CONFIG_COMPAT) && compat_sys_call_table[i] != (void *)&sys_ni_syscall) - vdso_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); + vdso_k_arch_data->compat_syscall_map[i >> 5] |= 0x80000000UL >> (i & 0x1f); } } @@ -352,10 +259,10 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) static int __init vdso_init(void) { #ifdef CONFIG_PPC64 - vdso_data->dcache_block_size = ppc64_caches.l1d.block_size; - vdso_data->icache_block_size = ppc64_caches.l1i.block_size; - vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; - vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; + vdso_k_arch_data->dcache_block_size = ppc64_caches.l1d.block_size; + vdso_k_arch_data->icache_block_size = ppc64_caches.l1i.block_size; + vdso_k_arch_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; + vdso_k_arch_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; #endif /* CONFIG_PPC64 */ vdso_setup_syscall_map(); diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 0e3ed6fb199f..e8824f933326 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -3,7 +3,7 @@ # List of files in the vdso, has to be asm only for now # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o obj-vdso64 = sigtramp64-64.o gettimeofday-64.o datapage-64.o cacheflush-64.o note-64.o getcpu-64.o diff --git a/arch/powerpc/kernel/vdso/cacheflush.S b/arch/powerpc/kernel/vdso/cacheflush.S index 0085ae464dac..488d3ade11e6 100644 --- a/arch/powerpc/kernel/vdso/cacheflush.S +++ b/arch/powerpc/kernel/vdso/cacheflush.S @@ -30,7 +30,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) #ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 - get_datapage r10 + get_datapage r10 vdso_u_arch_data mtlr r12 .cfi_restore lr #endif diff --git a/arch/powerpc/kernel/vdso/datapage.S b/arch/powerpc/kernel/vdso/datapage.S index db8e167f0166..d23b2e8e2a34 100644 --- a/arch/powerpc/kernel/vdso/datapage.S +++ b/arch/powerpc/kernel/vdso/datapage.S @@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mflr r12 .cfi_register lr,r12 mr. 
r4,r3 - get_datapage r3 + get_datapage r3 vdso_u_arch_data mtlr r12 #ifdef __powerpc64__ addi r3,r3,CFG_SYSCALL_MAP64 @@ -52,7 +52,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - get_datapage r3 + get_datapage r3 vdso_u_arch_data #ifndef __powerpc64__ lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) #endif diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index 5333848322ca..79c967212444 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -33,9 +33,9 @@ .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif .ifeq \call_time - get_datapage r5 VDSO_DATA_OFFSET + get_datapage r5 vdso_u_time_data .else - get_datapage r4 VDSO_DATA_OFFSET + get_datapage r4 vdso_u_time_data .endif bl CFUNC(DOTSYM(\funct)) PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) diff --git a/arch/powerpc/kernel/vdso/vdso32.lds.S b/arch/powerpc/kernel/vdso/vdso32.lds.S index 1a1b0b6d681a..72a1012b8a20 100644 --- a/arch/powerpc/kernel/vdso/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso/vdso32.lds.S @@ -6,6 +6,7 @@ #include <asm/vdso.h> #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle") @@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common) SECTIONS { - PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vdso64.lds.S b/arch/powerpc/kernel/vdso/vdso64.lds.S index e21b5506cad6..32102a05eaa7 100644 --- a/arch/powerpc/kernel/vdso/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso/vdso64.lds.S @@ -6,6 +6,7 @@ #include <asm/vdso.h> #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> #ifdef __LITTLE_ENDIAN__ OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle") @@ -16,7 +17,8 @@ OUTPUT_ARCH(powerpc:common64) SECTIONS { - PROVIDE(_vdso_datapage = . - 3 * PAGE_SIZE); + VDSO_VVAR_SYMS + . 
= SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/powerpc/kernel/vdso/vgettimeofday.c b/arch/powerpc/kernel/vdso/vgettimeofday.c index 55a287c9a736..6f5167d81af5 100644 --- a/arch/powerpc/kernel/vdso/vgettimeofday.c +++ b/arch/powerpc/kernel/vdso/vgettimeofday.c @@ -7,43 +7,43 @@ #ifdef __powerpc64__ int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime_data(vd, clock, ts); } int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_getres_data(vd, clock_id, res); } #else int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime32_data(vd, clock, ts); } int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_gettime_data(vd, clock, ts); } int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_clock_getres_time32_data(vd, clock_id, res); } #endif int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { return __cvdso_gettimeofday_data(vd, tv, tz); } -__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd) +__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_time_data *vd) { return __cvdso_time_data(vd, time); } diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 8c464a5d8246..2429cb1c7baa 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -495,8 +495,7 @@ static void start_watchdog(void *arg) *this_cpu_ptr(&wd_timer_tb) = get_tb(); - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms), HRTIMER_MODE_REL_PINNED); } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ce1d91eed231..61f2b7e007fa 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -766,8 +766,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + hrtimer_setup(&vcpu->arch.dec_timer, kvmppc_decrementer_wakeup, CLOCK_REALTIME, + HRTIMER_MODE_ABS); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 2b79171ee185..f4e03aaabb4c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -132,7 +132,10 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw) static inline void power_pmu_bhrb_enable(struct perf_event *event) {} static inline void power_pmu_bhrb_disable(struct perf_event *event) {} -static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {} +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ +} static inline void power_pmu_bhrb_read(struct 
perf_event *event, struct cpu_hw_events *cpuhw) {} static void pmao_restore_workaround(bool ebb) { } #endif /* CONFIG_PPC32 */ @@ -444,7 +447,8 @@ static void power_pmu_bhrb_disable(struct perf_event *event) /* Called from ctxsw to prevent one process's branch entries to * mingle with the other process's entries during context switch. */ -static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { if (!ppmu->bhrb_nr) return; diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index c9a9b759cc92..a379ff86c120 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -149,7 +149,7 @@ static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf, /* end of vector */ bufp[idx++] = cpu_to_be64(AT_NULL); - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + buf = append_elf64_note(buf, NN_AUXV, NT_AUXV, oc_conf->auxv_buf, AUXV_DESC_SZ); return buf; } @@ -252,7 +252,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) * crashing CPU's prstatus. */ first_cpu_note = buf; - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { @@ -279,7 +279,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) fill_prstatus(&prstatus, thread_pir, &regs); if (thread_pir != oc_conf->crashing_cpu) { - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); } else { @@ -287,7 +287,7 @@ static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) /* * Add crashing CPU as the first NT_PRSTATUS note for * GDB to process the core file appropriately. 
*/ - append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + append_elf64_note(first_cpu_note, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); } diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index 1574176e3ffc..c86950d7105a 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -482,14 +482,13 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc) goto free_blob; } - file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops, - (void *)blob, O_RDONLY); + file = anon_inode_getfile_fmode("[papr-vpd]", &papr_vpd_handle_ops, + (void *)blob, O_RDONLY, + FMODE_LSEEK | FMODE_PREAD); if (IS_ERR(file)) { err = PTR_ERR(file); goto put_fd; } - - file->f_mode |= FMODE_LSEEK | FMODE_PREAD; fd_install(fd, file); return fd; put_fd: diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 268859e4df87..1acb53aab252 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1271,11 +1271,7 @@ static int xmon_batch_next_cpu(void) { unsigned long cpu; - while (!cpumask_empty(&xmon_batch_cpus)) { - cpu = cpumask_next_wrap(smp_processor_id(), &xmon_batch_cpus, - xmon_batch_start_cpu, true); - if (cpu >= nr_cpu_ids) - break; + for_each_cpu_wrap(cpu, &xmon_batch_cpus, xmon_batch_start_cpu) { if (xmon_batch_start_cpu == -1) xmon_batch_start_cpu = cpu; if (xmon_switch_cpu(cpu)) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7612c52e9b1e..0409be768666 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN - select ARCH_HAS_VDSO_TIME_DATA + select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE select ARCH_KEEP_MEMBLOCK if ACPI select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -111,11 +111,13 @@ config RISCV select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_LIB_DEVMEM_IS_ALLOWED + select GENERIC_PENDING_IRQ if SMP select GENERIC_PCI_IOMAP select GENERIC_PTDUMP if MMU select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT + select GENERIC_VDSO_DATA_STORE if MMU select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HARDIRQS_SW_RESEND select HAS_IOPORT if MMU diff --git a/arch/riscv/boot/dts/sophgo/sg2042.dtsi b/arch/riscv/boot/dts/sophgo/sg2042.dtsi index e62ac51ac55a..fef2a0e0f7a3 100644 --- a/arch/riscv/boot/dts/sophgo/sg2042.dtsi +++ b/arch/riscv/boot/dts/sophgo/sg2042.dtsi @@ -173,6 +173,16 @@ #clock-cells = <1>; }; + msi: msi-controller@7030010304 { + compatible = "sophgo,sg2042-msi"; + reg = <0x70 0x30010304 0x0 0x4>, + <0x70 0x30010300 0x0 0x4>; + reg-names = "clr", "doorbell"; + msi-controller; + #msi-cells = <0>; + msi-ranges = <&intc 64 IRQ_TYPE_LEVEL_HIGH 32>; + }; + rpgate: clock-controller@7030010368 { compatible = "sophgo,sg2042-rpgate"; reg = <0x70 0x30010368 0x0 0x98>; diff --git a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h index 256de17f5261..ae49c908e7fb 100644 --- a/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h +++ b/arch/riscv/boot/dts/starfive/jh7110-pinfunc.h @@ -89,7 +89,7 @@ #define GPOUT_SYS_SDIO1_DATA1 59 #define GPOUT_SYS_SDIO1_DATA2 60 #define GPOUT_SYS_SDIO1_DATA3 61 -#define GPOUT_SYS_SDIO1_DATA4 63 +#define GPOUT_SYS_SDIO1_DATA4 62 #define GPOUT_SYS_SDIO1_DATA5 63 #define 
GPOUT_SYS_SDIO1_DATA6 64 #define GPOUT_SYS_SDIO1_DATA7 65 diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index c6bd3d8354a9..49a0f48d93df 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -226,7 +226,7 @@ legacy: * @nr: Bit to set * @addr: Address to count from * - * This operation may be reordered on other architectures than x86. + * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline int arch_test_and_set_bit(int nr, volatile unsigned long *addr) { @@ -238,7 +238,7 @@ static __always_inline int arch_test_and_set_bit(int nr, volatile unsigned long * @nr: Bit to clear * @addr: Address to count from * - * This operation can be reordered on other architectures other than x86. + * This is an atomic fully-ordered operation (implied full memory barrier). */ static __always_inline int arch_test_and_clear_bit(int nr, volatile unsigned long *addr) { diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1c5c641075d2..0257f4aa7ff4 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -136,7 +136,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #include <asm-generic/io.h> #ifdef CONFIG_MMU -#define arch_memremap_wb(addr, size) \ +#define arch_memremap_wb(addr, size, flags) \ ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) #endif diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index f891478829a5..c130d8100232 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -14,7 +14,7 @@ */ #ifdef CONFIG_MMU -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #ifndef __ASSEMBLY__ #include <generated/vdso-offsets.h> diff --git a/arch/riscv/include/asm/vdso/time_data.h b/arch/riscv/include/asm/vdso/arch_data.h index dfa65228999b..da57a3786f7a 100644 --- a/arch/riscv/include/asm/vdso/time_data.h +++ b/arch/riscv/include/asm/vdso/arch_data.h @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __RISCV_ASM_VDSO_TIME_DATA_H -#define __RISCV_ASM_VDSO_TIME_DATA_H +#ifndef __RISCV_ASM_VDSO_ARCH_DATA_H +#define __RISCV_ASM_VDSO_ARCH_DATA_H #include <linux/types.h> #include <vdso/datapage.h> #include <asm/hwprobe.h> -struct arch_vdso_time_data { +struct vdso_arch_data { /* Stash static answers to the hwprobe queries when all CPUs are selected. 
*/ __u64 all_cpu_hwprobe_values[RISCV_HWPROBE_MAX_KEY + 1]; @@ -14,4 +14,4 @@ struct arch_vdso_time_data { __u8 homogeneous_cpus; }; -#endif /* __RISCV_ASM_VDSO_TIME_DATA_H */ +#endif /* __RISCV_ASM_VDSO_ARCH_DATA_H */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index ba3283cf7acc..29164f84f93c 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -69,7 +69,7 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) #endif /* CONFIG_GENERIC_TIME_VSYSCALL */ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { /* * The purpose of csr_read(CSR_TIME) is to trap the system into @@ -79,18 +79,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return csr_read(CSR_TIME); } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/riscv/include/asm/vdso/vsyscall.h b/arch/riscv/include/asm/vdso/vsyscall.h index e8a9c4b53c0c..1140b54b4bc8 100644 --- a/arch/riscv/include/asm/vdso/vsyscall.h +++ b/arch/riscv/include/asm/vdso/vsyscall.h @@ -6,15 +6,6 @@ #include <vdso/datapage.h> -extern struct vdso_data *vdso_data; - -static __always_inline struct vdso_data *__riscv_get_k_vdso_data(void) -{ - return vdso_data; -} - -#define __arch_get_k_vdso_data __riscv_get_k_vdso_data - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index bcd3b816306c..04a4e5495512 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -450,8 +450,7 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs, static int __init init_hwprobe_vdso_data(void) { - struct vdso_data *vd = __arch_get_k_vdso_data(); - struct arch_vdso_time_data *avd = &vd->arch_data; + struct vdso_arch_data *avd = vdso_k_arch_data; u64 id_bitsmash = 0; struct riscv_hwprobe pair; int key; diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 3ca3ae4277e1..cc2895d1fbc2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -13,20 +13,11 @@ #include <linux/err.h> #include <asm/page.h> #include <asm/vdso.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> #include <vdso/vsyscall.h> -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - -#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) - -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; +#define VVAR_SIZE (VDSO_NR_PAGES << PAGE_SHIFT) struct __vdso_info { const char *name; @@ -79,78 +70,6 @@ static void __init __vdso_init(struct __vdso_info *vdso_info) vdso_info->cm->pages = vdso_pagelist; } -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static const struct vm_special_mapping rv_vvar_map; - -/* - * The vvar mapping contains data for a specific time namespace, so when a task - * changes namespace we must unmap its vvar data for the old namespace. 
- * Subsequent faults will map in data for the new namespace. - * - * For more details see timens_setup_vdso_data(). - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &rv_vvar_map)) - zap_vma_pages(vma); - } - - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - if (timens_page) - pfn = page_to_pfn(timens_page); - else - pfn = sym_to_pfn(vdso_data); - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = sym_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - - return vmf_insert_pfn(vma, vmf->address, pfn); -} - -static const struct vm_special_mapping rv_vvar_map = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping rv_vdso_map __ro_after_init = { .name = "[vdso]", .mremap = vdso_mremap, @@ -196,7 +115,7 @@ static int __setup_additional_pages(struct mm_struct *mm, unsigned long vdso_base, vdso_text_len, vdso_mapping_len; void *ret; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); vdso_text_len = vdso_info->vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ @@ -208,8 +127,7 @@ static int __setup_additional_pages(struct mm_struct *mm, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE, - (VM_READ | VM_MAYREAD | VM_PFNMAP), &rv_vvar_map); + ret = vdso_install_vvar_mapping(mm, vdso_base); if (IS_ERR(ret)) goto up_fail; diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 9a1b555e8733..ad73607abc28 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -2,7 +2,7 @@ # Copied from arch/tile/kernel/vdso/Makefile # Include the generic Makefile to check the built vdso. 
-include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Symbols present in the vdso vdso-syms = rt_sigreturn ifdef CONFIG_64BIT diff --git a/arch/riscv/kernel/vdso/hwprobe.c b/arch/riscv/kernel/vdso/hwprobe.c index a158c029344f..2ddeba6c68dd 100644 --- a/arch/riscv/kernel/vdso/hwprobe.c +++ b/arch/riscv/kernel/vdso/hwprobe.c @@ -16,8 +16,7 @@ static int riscv_vdso_get_values(struct riscv_hwprobe *pairs, size_t pair_count, size_t cpusetsize, unsigned long *cpus, unsigned int flags) { - const struct vdso_data *vd = __arch_get_vdso_data(); - const struct arch_vdso_time_data *avd = &vd->arch_data; + const struct vdso_arch_data *avd = &vdso_u_arch_data; bool all_cpus = !cpusetsize && !cpus; struct riscv_hwprobe *p = pairs; struct riscv_hwprobe *end = pairs + pair_count; @@ -51,8 +50,7 @@ static int riscv_vdso_get_cpus(struct riscv_hwprobe *pairs, size_t pair_count, size_t cpusetsize, unsigned long *cpus, unsigned int flags) { - const struct vdso_data *vd = __arch_get_vdso_data(); - const struct arch_vdso_time_data *avd = &vd->arch_data; + const struct vdso_arch_data *avd = &vdso_u_arch_data; struct riscv_hwprobe *p = pairs; struct riscv_hwprobe *end = pairs + pair_count; unsigned char *c = (unsigned char *)cpus; diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index cbe2a179331d..8e86965a8aae 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -4,15 +4,14 @@ */ #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_ARCH(riscv) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 96e7a4e463f7..ff672fa71fcc 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -248,18 +248,19 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu) if (t->init_done) return -EINVAL; - hrtimer_init(&t->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); t->init_done = true; t->next_set = false; /* Enable sstc for every vcpu if available in hardware */ if (riscv_isa_extension_available(NULL, SSTC)) { t->sstc_enabled = true; - t->hrt.function = kvm_riscv_vcpu_vstimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_vstimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_vstimecmp; } else { t->sstc_enabled = false; - t->hrt.function = kvm_riscv_vcpu_hrtimer_expired; + hrtimer_setup(&t->hrt, kvm_riscv_vcpu_hrtimer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); t->timer_next_event = kvm_riscv_vcpu_update_hrtimer; } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9c9ec08d78c7..7ed6f229250c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index 92c73e4d97a9..420a073fdde5 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -6,13 +6,11 @@ #ifndef __ASSEMBLY__ -extern struct vdso_data *vdso_data; - int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ -#define __VVAR_PAGES 2 +#define __VDSO_PAGES 4 #define VDSO_VERSION_STRING LINUX_2.6.29 diff --git 
a/arch/s390/include/asm/vdso/getrandom.h b/arch/s390/include/asm/vdso/getrandom.h index 36355af7160b..f8713ce39bb2 100644 --- a/arch/s390/include/asm/vdso/getrandom.h +++ b/arch/s390/include/asm/vdso/getrandom.h @@ -23,18 +23,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return syscall3(__NR_getrandom, (long)buffer, (long)len, (long)flags); } -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - /* - * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace - * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ - * PAGE_OFFSET points to the real VVAR page. - */ - if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - return &_vdso_rng_data; -} - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index 7937765ccfa5..fb4564308e9d 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -14,12 +14,7 @@ #include <linux/compiler.h> -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return _vdso_data; -} - -static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) +static inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_time_data *vd) { u64 adj, now; @@ -49,12 +44,4 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) return syscall2(__NR_clock_getres, (long)clkid, (long)ts); } -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return _timens_data; -} -#endif - #endif diff --git a/arch/s390/include/asm/vdso/vsyscall.h b/arch/s390/include/asm/vdso/vsyscall.h index 3eb576ecd3bd..d346ebe51301 100644 --- a/arch/s390/include/asm/vdso/vsyscall.h +++ b/arch/s390/include/asm/vdso/vsyscall.h @@ -2,32 +2,12 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 768 - #ifndef __ASSEMBLY__ #include <linux/hrtimer.h> #include <vdso/datapage.h> #include <asm/vdso.h> -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES -}; - -static __always_inline struct vdso_data *__s390_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __s390_get_k_vdso_data - -static __always_inline struct vdso_rng_data *__s390_get_k_vdso_rnd_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __s390_get_k_vdso_rnd_data - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 276cb4c1e11b..4a981266b483 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -246,15 +246,6 @@ bool is_kdump_kernel(void) } EXPORT_SYMBOL_GPL(is_kdump_kernel); -static const char *nt_name(Elf64_Word type) -{ - const char *name = "LINUX"; - - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) - name = KEXEC_CORE_NOTE_NAME; - return name; -} - /* * Initialize ELF note */ @@ -279,10 +270,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, return PTR_ADD(buf, len); } -static inline void *nt_init(void *buf, Elf64_Word type, void 
*desc, int d_len) -{ - return nt_init_name(buf, type, desc, d_len, nt_name(type)); -} +#define nt_init(buf, type, desc) \ + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type) /* * Calculate the size of ELF note @@ -298,10 +287,7 @@ static size_t nt_size_name(int d_len, const char *name) return size; } -static inline size_t nt_size(Elf64_Word type, int d_len) -{ - return nt_size_name(d_len, nt_name(type)); -} +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type) /* * Fill ELF notes for one CPU with save area registers @@ -322,18 +308,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); /* Create ELF notes for the CPU */ - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); + ptr = nt_init(ptr, PRSTATUS, nt_prstatus); + ptr = nt_init(ptr, PRFPREG, nt_fpregset); + ptr = nt_init(ptr, S390_TIMER, sa->timer); + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp); + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg); + ptr = nt_init(ptr, S390_CTRS, sa->ctrs); + ptr = nt_init(ptr, S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - ptr = nt_init(ptr, NT_S390_VXRS_HIGH, - &sa->vxrs_high, sizeof(sa->vxrs_high)); - ptr = nt_init(ptr, NT_S390_VXRS_LOW, - &sa->vxrs_low, sizeof(sa->vxrs_low)); + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high); + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low); } return ptr; } @@ -346,16 +330,16 @@ static size_t get_cpu_elf_notes_size(void) struct save_area *sa = NULL; size_t size; - size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); - size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); - size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); - size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); - size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); - size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); - size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); + size = nt_size(PRSTATUS, struct elf_prstatus); + size += nt_size(PRFPREG, elf_fpregset_t); + size += nt_size(S390_TIMER, sa->timer); + size += nt_size(S390_TODCMP, sa->todcmp); + size += nt_size(S390_TODPREG, sa->todpreg); + size += nt_size(S390_CTRS, sa->ctrs); + size += nt_size(S390_PREFIX, sa->prefix); if (cpu_has_vx()) { - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high); + size += nt_size(S390_VXRS_LOW, sa->vxrs_low); } return size; @@ -371,7 +355,7 @@ static void *nt_prpsinfo(void *ptr) memset(&prpsinfo, 0, sizeof(prpsinfo)); prpsinfo.pr_sname = 'R'; strcpy(prpsinfo.pr_fname, "vmlinux"); - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); + return nt_init(ptr, PRPSINFO, prpsinfo); } /* @@ -610,7 +594,7 @@ static size_t get_elfcorehdr_size(int phdr_count) /* PT_NOTES */ size += sizeof(Elf64_Phdr); /* nt_prpsinfo */ - size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + size += nt_size(PRPSINFO, struct elf_prpsinfo); /* regsets */ size += get_cpu_cnt() * get_cpu_elf_notes_size(); /* nt_vmcoreinfo 
*/ diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 1ecd0580561f..911b95cd57e5 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -198,13 +198,8 @@ void __noreturn die(struct pt_regs *regs, const char *str) console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); - printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff, regs->int_code >> 17, ++die_counter); -#ifdef CONFIG_PREEMPT - pr_cont("PREEMPT "); -#elif defined(CONFIG_PREEMPT_RT) - pr_cont("PREEMPT_RT "); -#endif pr_cont("SMP "); if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 10725f5a6f0f..63875270941b 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -518,7 +518,8 @@ static void paicrypt_have_samples(void) /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event CRYPTO_ALL is allowed. */ -static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { /* We started with a clean page on event installation. So read out * results on schedule_out and if page was dirty, save old values. diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index a8f0bad99cf0..fd14d5ebccbc 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -542,7 +542,8 @@ static void paiext_have_samples(void) /* Called on schedule-in and schedule-out. No access to event structure, * but for sampling only event NNPA_ALL is allowed. */ -static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { /* We started with a clean page on event installation. So read out * results on schedule_out and if page was dirty, save old values. 
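Editor's note: the paicrypt_sched_task()/paiext_sched_task() hunks above reflect a perf core interface change: the PMU sched_task hook now receives the task being scheduled in or out in addition to the direction of the switch. A minimal, hypothetical driver-side stub with the new prototype might look like the sketch below; my_pmu and my_pmu_sched_task are invented names, and only the prototype mirrors the hunks above.

        #include <linux/perf_event.h>
        #include <linux/printk.h>
        #include <linux/sched.h>

        /* Hypothetical PMU driver callback using the new sched_task signature. */
        static void my_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
                                      struct task_struct *task, bool sched_in)
        {
                /* The hook now sees *which* task is switching, not just the direction. */
                if (sched_in)
                        pr_debug("my_pmu: sched in, pid %d\n", task ? task->pid : -1);
                else
                        pr_debug("my_pmu: sched out\n");
        }

        static struct pmu my_pmu = {
                .sched_task     = my_pmu_sched_task,
                /* .event_init, .add, .del, ... omitted from this sketch */
        };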
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 5ce9a795a0fe..745649ad9779 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -72,7 +72,7 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) this_cpu = smp_processor_id(); if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { __this_cpu_write(cpu_relax_retry, 0); - cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + cpu = cpumask_next_wrap(this_cpu, cpumask); if (cpu >= nr_cpu_ids) return; if (arch_vcpu_is_preempted(cpu)) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index e9115b4d8b63..a4569b96ef06 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -469,3 +469,4 @@ 464 common getxattrat sys_getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e9f47c3a6197..699a18f1c54e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -79,12 +79,10 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; - int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - for (cs = 0; cs < CS_BASES; cs++) - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -373,7 +371,6 @@ static void clock_sync_global(long delta) { unsigned long now, adj; struct ptff_qto qto; - int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -389,10 +386,8 @@ static void clock_sync_global(long delta) panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - for (cs = 0; cs < CS_BASES; cs++) { - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; - vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; - } + vdso_k_time_data->arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_steering_delta = tod_steering_delta; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 598b512cde01..70c8f9ad13cd 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -16,8 +16,8 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/time_namespace.h> #include <linux/random.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> #include <asm/vdso/vsyscall.h> #include <asm/alternative.h> @@ -26,85 +26,6 @@ extern char vdso64_start[], vdso64_end[]; extern char vdso32_start[], vdso32_end[]; -static struct vm_special_mapping vvar_mapping; - -static union vdso_data_store vdso_data_store __page_aligned_data; - -struct vdso_data *vdso_data = vdso_data_store.data; - -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -/* - * The VVAR page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will be re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. 
- */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - VMA_ITERATOR(vmi, mm, 0); - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (!vma_is_special_mapping(vma, &vvar_mapping)) - continue; - zap_vma_pages(vma); - break; - } - mmap_read_unlock(mm); - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - if (timens_page) { - /* - * Fault in VVAR page too, since it will be accessed - * to get clock data anyway. - */ - addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - pfn = page_to_pfn(timens_page); - } - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - return vmf_insert_pfn(vma, vmf->address, pfn); -} - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { @@ -112,11 +33,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; - static struct vm_special_mapping vdso64_mapping = { .name = "[vdso]", .mremap = vdso_mremap, @@ -142,7 +58,7 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) struct vm_area_struct *vma; int rc; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -157,14 +73,11 @@ static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + vma = vdso_install_vvar_mapping(mm, vvar_start); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| @@ -220,7 +133,7 @@ unsigned long vdso_text_size(void) unsigned long vdso_size(void) { - return vdso_text_size() + VVAR_NR_PAGES * PAGE_SIZE; + return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE; } int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index 2c5afb88d298..1e4ddd1a683f 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso # Include the generic Makefile to check the built vdso. 
-include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso32 = vdso_user_wrapper-32.o note-32.o # Build rules diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index c916c4f73f76..9630d58c2080 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -6,16 +6,15 @@ #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") OUTPUT_ARCH(s390:31-bit) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index ad206f2068d8..d8f0df742809 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso # Include the generic Makefile to check the built vdso. -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index ec42b7d9cb53..e4f6551ae898 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -7,17 +7,15 @@ #include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); - PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif + VDSO_VVAR_SYMS + . 
= SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 07ff0e10cb7f..0f00f8e85fee 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3174,8 +3174,7 @@ void kvm_s390_gisa_init(struct kvm *kvm) gi->alert.mask = 0; spin_lock_init(&gi->alert.ref_lock); gi->expires = 50 * 1000; /* 50 usec */ - hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - gi->timer.function = gisa_vcpu_kicker; + hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL); memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 020502af7dc9..48104e59648b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3943,8 +3943,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) if (rc) return rc; } - hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vcpu->arch.sie_block->hpid = HPID_KVM; diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c8cad33bf250..52a7652fcff6 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -470,3 +470,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 727f99d333b3..83e45eb6c095 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -512,3 +512,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/sparc/vdso/Makefile b/arch/sparc/vdso/Makefile index 50ec2978cda5..fdc4a8f5a49c 100644 --- a/arch/sparc/vdso/Makefile +++ b/arch/sparc/vdso/Makefile @@ -22,7 +22,7 @@ targets += $(foreach x, 32 64, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg) CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 --no-undefined \ +VDSO_LDFLAGS_vdso.lds = -m elf64_sparc -soname linux-vdso.so.1 \ -z max-page-size=8192 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE @@ -101,7 +101,6 @@ $(obj)/vdso32.so.dbg: FORCE \ quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(src)/checkundef.sh '$(OBJDUMP)' '$@' + -T $(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 -Bsymbolic --no-undefined diff --git a/arch/sparc/vdso/checkundef.sh b/arch/sparc/vdso/checkundef.sh deleted file mode 100644 index 2d85876ffc32..000000000000 --- a/arch/sparc/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -objdump="$1" -file="$2" -$objdump -t "$file" | grep '*UUND*' | grep -v '#scratch' > /dev/null 2>&1 -if [ $? 
-eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797e..8be91974e786 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void apply_seal_endbr(s32 *start, s32 *end) { } -void apply_retpolines(s32 *start, s32 *end, struct module *mod) +void apply_retpolines(s32 *start, s32 *end) { } -void apply_returns(s32 *start, s32 *end, struct module *mod) +void apply_returns(s32 *start, s32 *end) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index cf0ad89f5639..f7fb3d88c57b 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe. Disable it for arch/x86/* +subdir-ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += coco/ obj-y += entry/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0e27ebd7e36a..05b4eca156cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_EXECMEM_ROX if X86_64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -132,7 +133,7 @@ config X86 select ARCH_SUPPORTS_AUTOFDO_CLANG select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 + select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -178,6 +179,7 @@ config X86 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_DATA_STORE select GENERIC_VDSO_TIME_NS select GENERIC_VDSO_OVERFLOW_PROTECT select GUP_GET_PXX_LOW_HIGH if X86_PAE @@ -232,7 +234,7 @@ config X86 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_EISA + select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE @@ -277,7 +279,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API @@ -285,7 +287,7 @@ config X86 select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK - select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR + select HAVE_STACKPROTECTOR select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL @@ -426,15 +428,6 @@ config PGTABLE_LEVELS default 3 if X86_PAE default 2 -config CC_HAS_SANE_STACKPROTECTOR - bool - default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT - default 
$(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) - help - We have to make sure stack protector is unconditionally disabled if - the compiler produces broken code or if it does not let us control - the segment on 32-bit kernels. - menu "Processor type and features" config SMP @@ -505,6 +498,7 @@ config X86_CPU_RESCTRL depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) select KERNFS select PROC_CPU_RESCTRL if PROC_FS + select RESCTRL_FS_PSEUDO_LOCK help Enable x86 CPU resource control support. @@ -521,6 +515,12 @@ config X86_CPU_RESCTRL Say N if unsure. +config RESCTRL_FS_PSEUDO_LOCK + bool + help + Software mechanism to pin data in a cache portion using + micro-architecture specific knowledge. + config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 @@ -530,12 +530,6 @@ config X86_FRED ring transitions and exception/interrupt handling if the system supports it. -config X86_BIGSMP - bool "Support for big SMP systems with more than 8 CPUs" - depends on SMP && X86_32 - help - This option is needed for the systems that have more than 8 CPUs. - config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -553,13 +547,12 @@ config X86_EXTENDED_PLATFORM AMD Elan RDC R-321x SoC SGI 320/540 (Visual Workstation) - STA2X11-based (e.g. Northville) - Moorestown MID devices 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet + Merrifield/Moorefield MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -604,8 +597,31 @@ config X86_UV This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. -# Following is an alphabetically sorted list of 32 bit extended platforms -# Please maintain the alphabetic order if and when there are additions +config X86_INTEL_MID + bool "Intel Z34xx/Z35xx MID platform support" + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on PCI + depends on X86_64 || (EXPERT && PCI_GOANY) + depends on X86_IO_APIC + select I2C + select DW_APB_TIMER + select INTEL_SCU_PCI + help + Select to build a kernel capable of supporting 64-bit Intel MID + (Mobile Internet Device) platform systems which do not have + the PCI legacy interfaces. + + The only supported devices are the 22nm Merrified (Z34xx) + and Moorefield (Z35xx) SoC used in the Intel Edison board and + a small number of Android devices such as the Asus Zenfone 2, + Asus FonePad 8 and Dell Venue 7. + + If you are building for a PC class system or non-MID tablet + SoCs like Bay Trail (Z36xx/Z37xx), say N here. + + Intel MID platforms are based on an Intel processor and chipset which + consume less power than most of the x86 derivatives. config X86_GOLDFISH bool "Goldfish (Virtual Platform)" @@ -615,6 +631,9 @@ config X86_GOLDFISH for Android development. Unless you are building for the Android Goldfish emulator say N here. +# Following is an alphabetically sorted list of 32 bit extended platforms +# Please maintain the alphabetic order if and when there are additions + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -630,24 +649,6 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. 
-config X86_INTEL_MID - bool "Intel MID platform support" - depends on X86_EXTENDED_PLATFORM - depends on X86_PLATFORM_DEVICES - depends on PCI - depends on X86_64 || (PCI_GOANY && X86_32) - depends on X86_IO_APIC - select I2C - select DW_APB_TIMER - select INTEL_SCU_PCI - help - Select to build a kernel capable of supporting Intel MID (Mobile - Internet Device) platform systems which do not have the PCI legacy - interfaces. If you are building for a PC class system say N here. - - Intel MID platforms are based on an Intel processor and chipset which - consume less power than most of the x86 derivatives. - config X86_INTEL_QUARK bool "Intel Quark platform support" depends on X86_32 @@ -729,18 +730,6 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config X86_32_NON_STANDARD - bool "Support non-standard 32-bit SMP architectures" - depends on X86_32 && SMP - depends on X86_EXTENDED_PLATFORM - help - This option compiles in the bigsmp and STA2X11 default - subarchitectures. It is intended for a generic binary - kernel. If you select them all, kernel will probe it one by - one and will fallback to default. - -# Alphabetically sorted list of Non standard 32 bit platforms - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): @@ -750,19 +739,6 @@ config X86_SUPPORTS_MEMORY_FAILURE depends on X86_64 || !SPARSEMEM select ARCH_SUPPORTS_MEMORY_FAILURE -config STA2X11 - bool "STA2X11 Companion Chip Support" - depends on X86_32_NON_STANDARD && PCI - select SWIOTLB - select MFD_STA2X11 - select GPIOLIB - help - This adds support for boards based on the STA2X11 IO-Hub, - a.k.a. "ConneXt". The chip is used in place of the standard - PC chipset, so all "standard" peripherals are missing. If this - option is selected the kernel will still be able to boot on - standard PC machines. - config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" depends on X86_32 @@ -1012,8 +988,7 @@ config NR_CPUS_RANGE_BEGIN config NR_CPUS_RANGE_END int depends on X86_32 - default 64 if SMP && X86_BIGSMP - default 8 if SMP && !X86_BIGSMP + default 8 if SMP default 1 if !SMP config NR_CPUS_RANGE_END @@ -1026,7 +1001,6 @@ config NR_CPUS_RANGE_END config NR_CPUS_DEFAULT int depends on X86_32 - default 32 if X86_BIGSMP default 8 if SMP default 1 if !SMP @@ -1102,7 +1076,7 @@ config UP_LATE_INIT config X86_UP_APIC bool "Local APIC support on uniprocessors" if !PCI_MSI default PCI_MSI - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -1127,7 +1101,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI + depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY config ACPI_MADT_WAKEUP @@ -1396,15 +1370,11 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -choice - prompt "High Memory Support" - default HIGHMEM4G +config HIGHMEM4G + bool "High Memory Support" depends on X86_32 - -config NOHIGHMEM - bool "off" help - Linux can use up to 64 Gigabytes of physical memory on x86 systems. + Linux can use up to 4 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 Gigabytes large. 
That means that, if you have a large amount of physical memory, not all of it can be "permanently mapped" by the @@ -1420,38 +1390,9 @@ config NOHIGHMEM possible. If the machine has between 1 and 4 Gigabytes physical RAM, then - answer "4GB" here. - - If more than 4 Gigabytes is used then answer "64GB" here. This - selection turns Intel PAE (Physical Address Extension) mode on. - PAE implements 3-level paging on IA32 processors. PAE is fully - supported by Linux, PAE mode is implemented on all recent Intel - processors (Pentium Pro and better). NOTE: If you say "64GB" here, - then the kernel will not boot on CPUs that don't support PAE! - - The actual amount of total physical memory will either be - auto detected or can be forced by using a kernel command line option - such as "mem=256M". (Try "man bootparam" or see the documentation of - your boot loader (lilo or loadlin) about how to pass options to the - kernel at boot time.) - - If unsure, say "off". - -config HIGHMEM4G - bool "4GB" - help - Select this if you have a 32-bit processor and between 1 and 4 - gigabytes of physical RAM. - -config HIGHMEM64G - bool "64GB" - depends on X86_HAVE_PAE - select X86_PAE - help - Select this if you have a 32-bit processor and more than 4 - gigabytes of physical RAM. + answer "Y" here. -endchoice + If unsure, say N. choice prompt "Memory split" if EXPERT @@ -1497,14 +1438,12 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - def_bool y - depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) + def_bool HIGHMEM4G config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && X86_HAVE_PAE select PHYS_ADDR_T_64BIT - select SWIOTLB help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It @@ -1574,8 +1513,7 @@ config AMD_MEM_ENCRYPT config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) - default y if X86_BIGSMP + depends on X86_64 select USE_PERCPU_NUMA_NODE_ID select OF_NUMA if OF help @@ -1588,9 +1526,6 @@ config NUMA For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. - For 32-bit this is only needed if you boot a 32-bit - kernel on a 64-bit NUMA platform. - Otherwise, you should say N. config AMD_NUMA @@ -1629,7 +1564,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD + depends on X86_64 || NUMA || X86_32 select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -1675,15 +1610,6 @@ config X86_PMEM_LEGACY Say Y if unsure. -config HIGHPTE - bool "Allocate 3rd-level pagetables from highmem" - depends on HIGHMEM - help - The VM uses one page table entry for each page of physical memory. - For systems with a lot of RAM, this can be wasteful of precious - low memory. Setting this option will put user-space page table - entries in high memory. - config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" help @@ -2451,18 +2377,20 @@ config CC_HAS_NAMED_AS def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null) depends on CC_IS_GCC +# +# -fsanitize=kernel-address (KASAN) and -fsanitize=thread (KCSAN) +# are incompatible with named address spaces with GCC < 13.3 +# (see GCC PR sanitizer/111736 and also PR sanitizer/115172). 
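Editor's note: the CC_HAS_NAMED_AS probe quoted above ($(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) ...)) tests whether the compiler accepts GCC named address spaces, which the kernel uses for %gs-based per-CPU accesses when USE_X86_SEG_SUPPORT is enabled. As a rough, stand-alone illustration of the compiler extension itself (not the kernel's percpu machinery; my_counter is a made-up symbol and requires a toolchain that supports __seg_gs):

        /* A variable declared in the %gs-based named address space. */
        extern unsigned long __seg_gs my_counter;

        static inline unsigned long read_counter(void)
        {
                /* A plain C access compiles to a single %gs-relative load. */
                return my_counter;
        }

The sanitizer gating being restructured here follows from the new comment above: with GCC releases older than 13.3, KASAN and KCSAN instrumentation is incompatible with such named-address-space accesses (GCC PR sanitizer/111736 and PR sanitizer/115172).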
+# + config CC_HAS_NAMED_AS_FIXED_SANITIZERS - def_bool CC_IS_GCC && GCC_VERSION >= 130300 + def_bool y + depends on !(KASAN || KCSAN) || GCC_VERSION >= 130300 + depends on !(UBSAN_BOOL && KASAN) || GCC_VERSION >= 140200 config USE_X86_SEG_SUPPORT - def_bool y - depends on CC_HAS_NAMED_AS - # - # -fsanitize=kernel-address (KASAN) and -fsanitize=thread - # (KCSAN) are incompatible with named address spaces with - # GCC < 13.3 - see GCC PR sanitizer/111736. - # - depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS + def_bool CC_HAS_NAMED_AS + depends on CC_HAS_NAMED_AS_FIXED_SANITIZERS config CC_HAS_SLS def_bool $(cc-option,-mharden-sls=all) @@ -2473,6 +2401,10 @@ config CC_HAS_RETURN_THUNK config CC_HAS_ENTRY_PADDING def_bool $(cc-option,-fpatchable-function-entry=16,16) +config CC_HAS_KCFI_ARITY + def_bool $(cc-option,-fsanitize=kcfi -fsanitize-kcfi-arity) + depends on CC_IS_CLANG && !RUST + config FUNCTION_PADDING_CFI int default 59 if FUNCTION_ALIGNMENT_64B @@ -2498,6 +2430,10 @@ config FINEIBT depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE select CALL_PADDING +config FINEIBT_BHI + def_bool y + depends on FINEIBT && CC_HAS_KCFI_ARITY + config HAVE_CALL_THUNKS def_bool y depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL @@ -3202,4 +3138,6 @@ config HAVE_ATOMIC_IOMAP source "arch/x86/kvm/Kconfig" +source "arch/x86/Kconfig.cpufeatures" + source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2a7279d80460..753b8763abae 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # Put here option for CPU selection and depending optimization choice - prompt "Processor family" - default M686 if X86_32 - default GENERIC_CPU if X86_64 + prompt "x86-32 Processor family" + depends on X86_32 + default M686 help This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -31,7 +31,6 @@ choice - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. @@ -42,13 +41,10 @@ choice - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. - - "Intel P4" for the Pentium 4/Netburst microarchitecture. - - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. - "Intel Atom" for the Atom-microarchitecture CPUs. - - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. See each option's help text for additional details. If you don't know - what to do, choose "486". + what to do, choose "Pentium-Pro". config M486SX bool "486SX" @@ -114,11 +110,11 @@ config MPENTIUMIII extensions. config MPENTIUMM - bool "Pentium M" + bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo" depends on X86_32 help Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. 
+ "Merom" Core Solo/Duo notebook chips config MPENTIUM4 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" @@ -139,22 +135,10 @@ config MPENTIUM4 -Mobile Pentium 4 -Mobile Pentium 4 M -Extreme Edition (Gallatin) - -Prescott - -Prescott 2M - -Cedar Mill - -Presler - -Smithfiled Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename: -Foster -Prestonia -Gallatin - -Nocona - -Irwindale - -Cranford - -Potomac - -Paxville - -Dempsey - config MK6 bool "K6/K6-II/K6-III" @@ -172,13 +156,6 @@ config MK7 some extended instructions, and passes appropriate optimization flags to GCC. -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - config MCRUSOE bool "Crusoe" depends on X86_32 @@ -258,42 +235,14 @@ config MVIAC7 Select this for a VIA C7. Selecting this uses the correct cache shift and tells gcc to treat the CPU as a 686. -config MPSC - bool "Intel P4 / older Netburst based Xeon" - depends on X86_64 - help - Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey - Xeon CPUs with Intel 64bit which is compatible with x86-64. - Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them - using the cpu family field - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - -config MCORE2 - bool "Core 2/newer Xeon" - help - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and - 53xx) CPUs. You can distinguish newer from older Xeons by the CPU - family in /proc/cpuinfo. Newer ones have 6 and older ones 15 - (not a typo) - config MATOM bool "Intel Atom" help - Select this for the Intel Atom platform. Intel Atom CPUs have an in-order pipelining architecture and thus can benefit from accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. -config GENERIC_CPU - bool "Generic-x86-64" - depends on X86_64 - help - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. - endchoice config X86_GENERIC @@ -317,8 +266,8 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int - default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "7" if MPENTIUM4 + default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64 default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -336,51 +285,35 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM - -# -# P6_NOPs are a relatively minor optimization that require a family >= -# 6 processor, except that it is broken on certain VIA chips. -# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). 
In addition, it looks like Virtual PC -# does not understand them. -# -# As a result, disallow these if we're not compiling for X86_64 (these -# NOPs do work on all x86-64 capable chips); the list of processors in -# the right-hand clause are the cores that benefit from this optimization. -# -config X86_P6_NOP - def_bool y - depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64 config X86_HAVE_PAE def_bool y - depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64 + depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 -config X86_CMPXCHG64 +config X86_CX8 def_bool y - depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 + depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) - default "5" if X86_32 && X86_CMPXCHG64 + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) + default "5" if X86_32 && X86_CX8 default "4" config X86_DEBUGCTLMSR @@ -401,6 +334,10 @@ menuconfig PROCESSOR_SELECT This lets you choose what x86 vendor support code your kernel will include. +config BROADCAST_TLB_FLUSH + def_bool y + depends on CPU_SUP_AMD && 64BIT + config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT diff --git a/arch/x86/Kconfig.cpufeatures b/arch/x86/Kconfig.cpufeatures new file mode 100644 index 000000000000..e12d5b7e39a2 --- /dev/null +++ b/arch/x86/Kconfig.cpufeatures @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# x86 feature bits (see arch/x86/include/asm/cpufeatures.h) that are +# either REQUIRED to be enabled, or DISABLED (always ignored) for this +# particular compile-time configuration. The tests for these features +# are turned into compile-time constants via the generated +# <asm/cpufeaturemasks.h>. 
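Editor's note: the Kconfig.cpufeatures comment above (continued below with a worked X86_FRED example) describes how an awk script turns these X86_REQUIRED_FEATURE_*/X86_DISABLED_FEATURE_* symbols plus asm/cpufeatures.h into constant masks in the generated <asm/cpufeaturemasks.h>, so feature tests against known-disabled features resolve at build time. A deliberately simplified stand-in for that effect follows; none of these names are the kernel's real macros, and runtime_cpu_has() is hypothetical.

        /* Stand-in for what the generated header would contain for word 12. */
        #define DISABLED_MASK12         (1u << 17)      /* e.g. X86_FRED=n */

        extern int runtime_cpu_has(unsigned int feature);       /* hypothetical */

        /* feature encoded as 32 * word + bit, e.g. FRED = 32 * 12 + 17 */
        static inline int feature_enabled(unsigned int feature)
        {
                if (feature / 32 == 12 &&
                    (DISABLED_MASK12 & (1u << (feature % 32))))
                        return 0;       /* folded away when 'feature' is a constant */
                return runtime_cpu_has(feature);
        }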
+# +# The naming of these variables *must* match asm/cpufeatures.h, e.g., +# X86_FEATURE_ALWAYS <==> X86_REQUIRED_FEATURE_ALWAYS +# X86_FEATURE_FRED <==> X86_DISABLED_FEATURE_FRED +# +# And these REQUIRED and DISABLED config options are manipulated in an +# AWK script as the following example: +# +# +----------------------+ +# | X86_FRED = y ? | +# +----------------------+ +# / \ +# Y / \ N +# +-------------------------------------+ +-------------------------------+ +# | X86_DISABLED_FEATURE_FRED undefined | | X86_DISABLED_FEATURE_FRED = y | +# +-------------------------------------+ +-------------------------------+ +# | +# | +# +-------------------------------------------+ | +# | X86_FEATURE_FRED: feature word 12, bit 17 | ---->| +# +-------------------------------------------+ | +# | +# | +# +-------------------------------+ +# | set bit 17 of DISABLED_MASK12 | +# +-------------------------------+ +# + +config X86_REQUIRED_FEATURE_ALWAYS + def_bool y + +config X86_REQUIRED_FEATURE_NOPL + def_bool y + depends on X86_64 || X86_P6_NOP + +config X86_REQUIRED_FEATURE_CX8 + def_bool y + depends on X86_CX8 + +# this should be set for all -march=.. options where the compiler +# generates cmov. +config X86_REQUIRED_FEATURE_CMOV + def_bool y + depends on X86_CMOV + +# this should be set for all -march= options where the compiler +# generates movbe. +config X86_REQUIRED_FEATURE_MOVBE + def_bool y + depends on MATOM + +config X86_REQUIRED_FEATURE_CPUID + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_UP + def_bool y + depends on !SMP + +config X86_REQUIRED_FEATURE_FPU + def_bool y + depends on !MATH_EMULATION + +config X86_REQUIRED_FEATURE_PAE + def_bool y + depends on X86_64 || X86_PAE + +config X86_REQUIRED_FEATURE_PSE + def_bool y + depends on X86_64 && !PARAVIRT_XXL + +config X86_REQUIRED_FEATURE_PGE + def_bool y + depends on X86_64 && !PARAVIRT_XXL + +config X86_REQUIRED_FEATURE_MSR + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_FXSR + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_XMM + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_XMM2 + def_bool y + depends on X86_64 + +config X86_REQUIRED_FEATURE_LM + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_UMIP + def_bool y + depends on !X86_UMIP + +config X86_DISABLED_FEATURE_VME + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_K6_MTRR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_CYRIX_ARR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_CENTAUR_MCR + def_bool y + depends on X86_64 + +config X86_DISABLED_FEATURE_PCID + def_bool y + depends on !X86_64 + +config X86_DISABLED_FEATURE_PKU + def_bool y + depends on !X86_INTEL_MEMORY_PROTECTION_KEYS + +config X86_DISABLED_FEATURE_OSPKE + def_bool y + depends on !X86_INTEL_MEMORY_PROTECTION_KEYS + +config X86_DISABLED_FEATURE_LA57 + def_bool y + depends on !X86_5LEVEL + +config X86_DISABLED_FEATURE_PTI + def_bool y + depends on !MITIGATION_PAGE_TABLE_ISOLATION + +config X86_DISABLED_FEATURE_RETPOLINE + def_bool y + depends on !MITIGATION_RETPOLINE + +config X86_DISABLED_FEATURE_RETPOLINE_LFENCE + def_bool y + depends on !MITIGATION_RETPOLINE + +config X86_DISABLED_FEATURE_RETHUNK + def_bool y + depends on !MITIGATION_RETHUNK + +config X86_DISABLED_FEATURE_UNRET + def_bool y + depends on !MITIGATION_UNRET_ENTRY + +config X86_DISABLED_FEATURE_CALL_DEPTH + def_bool y + depends on !MITIGATION_CALL_DEPTH_TRACKING + +config X86_DISABLED_FEATURE_LAM + def_bool y + depends on 
!ADDRESS_MASKING + +config X86_DISABLED_FEATURE_ENQCMD + def_bool y + depends on !INTEL_IOMMU_SVM + +config X86_DISABLED_FEATURE_SGX + def_bool y + depends on !X86_SGX + +config X86_DISABLED_FEATURE_XENPV + def_bool y + depends on !XEN_PV + +config X86_DISABLED_FEATURE_TDX_GUEST + def_bool y + depends on !INTEL_TDX_GUEST + +config X86_DISABLED_FEATURE_USER_SHSTK + def_bool y + depends on !X86_USER_SHADOW_STACK + +config X86_DISABLED_FEATURE_IBT + def_bool y + depends on !X86_KERNEL_IBT + +config X86_DISABLED_FEATURE_FRED + def_bool y + depends on !X86_FRED + +config X86_DISABLED_FEATURE_SEV_SNP + def_bool y + depends on !KVM_AMD_SEV + +config X86_DISABLED_FEATURE_INVLPGB + def_bool y + depends on !BROADCAST_TLB_FLUSH diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b773b34768d..0fc7e8fd1a2e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -137,17 +137,12 @@ ifeq ($(CONFIG_X86_32),y) include $(srctree)/arch/x86/Makefile_32.cpu KBUILD_CFLAGS += $(cflags-y) - # temporary until string.h is fixed + ifneq ($(call clang-min-version, 160000),y) + # https://github.com/llvm/llvm-project/issues/53645 KBUILD_CFLAGS += -ffreestanding - - ifeq ($(CONFIG_STACKPROTECTOR),y) - ifeq ($(CONFIG_SMP),y) - KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \ - -mstack-protector-guard-symbol=__ref_stack_chk_guard - else - KBUILD_CFLAGS += -mstack-protector-guard=global - endif endif + + percpu_seg := fs else BITS := 64 UTS_MACHINE := x86_64 @@ -178,25 +173,24 @@ else # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) - cflags-$(CONFIG_MK8) += -march=k8 - cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic - KBUILD_CFLAGS += $(cflags-y) - - rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 - rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona - rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 - rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic - KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel KBUILD_RUSTFLAGS += -Cno-redzone=y KBUILD_RUSTFLAGS += -Ccode-model=kernel + + percpu_seg := gs +endif + +ifeq ($(CONFIG_STACKPROTECTOR),y) + ifeq ($(CONFIG_SMP),y) + KBUILD_CFLAGS += -mstack-protector-guard-reg=$(percpu_seg) + KBUILD_CFLAGS += -mstack-protector-guard-symbol=__ref_stack_chk_guard + else + KBUILD_CFLAGS += -mstack-protector-guard=global + endif endif # @@ -277,6 +271,21 @@ archheaders: $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all ### +# <asm/cpufeaturemasks.h> header generation + +cpufeaturemasks.hdr := arch/x86/include/generated/asm/cpufeaturemasks.h +cpufeaturemasks.awk := $(srctree)/arch/x86/tools/cpufeaturemasks.awk +cpufeatures_hdr := $(srctree)/arch/x86/include/asm/cpufeatures.h +targets += $(cpufeaturemasks.hdr) +quiet_cmd_gen_featuremasks = GEN $@ + cmd_gen_featuremasks = $(AWK) -f $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) > $@ + +$(cpufeaturemasks.hdr): $(cpufeaturemasks.awk) $(cpufeatures_hdr) $(KCONFIG_CONFIG) FORCE + $(shell mkdir -p $(dir $@)) + $(call if_changed,gen_featuremasks) +archprepare: $(cpufeaturemasks.hdr) + +### # Kernel objects libs-y += arch/x86/lib/ diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 
94834c4b5e5e..af7de9a42752 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -24,7 +24,6 @@ cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. cflags-$(CONFIG_MK7) += -march=athlon -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,9 +31,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 -cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MATOM) += -march=atom # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 1189be057ebd..070ef534c915 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -12,3 +12,4 @@ fdimage mtools.conf image.iso hdimage +tools/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9cc0ff6e9067..8589471b65a1 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -35,7 +35,6 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs := tools/build hostprogs += mkcpustr HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ @@ -61,11 +60,9 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -silent_redirect_image = >/dev/null -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ $($(quiet)redirect_image) + cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@ -$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE +$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')' diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 0f24f7ebec9b..38f17a1e1e36 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -16,7 +16,7 @@ #define STACK_SIZE 1024 /* Minimum number of bytes for stack */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stdarg.h> #include <linux/types.h> @@ -327,6 +327,6 @@ void probe_cards(int unsafe); /* video-vesa.c */ void vesa_store_edid(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 606c74f27459..0e0b238e8363 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -98,6 +98,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o + vmlinux-objs-y += $(obj)/la57toggle.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1dcb794c5479..3dc86352cdbe 100644 --- 
a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -483,110 +483,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) jmp *%rax SYM_FUNC_END(.Lrelocated) -/* - * This is the 32-bit trampoline that will be copied over to low memory. It - * will be called using the ordinary 64-bit calling convention from code - * running in 64-bit mode. - * - * Return address is at the top of the stack (might be above 4G). - * The first argument (EDI) contains the address of the temporary PGD level - * page table in 32-bit addressable memory which will be programmed into - * register CR3. - */ - .section ".rodata", "a", @progbits -SYM_CODE_START(trampoline_32bit_src) - /* - * Preserve callee save 64-bit registers on the stack: this is - * necessary because the architecture does not guarantee that GPRs will - * retain their full 64-bit values across a 32-bit mode switch. - */ - pushq %r15 - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbp - pushq %rbx - - /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ - movq %rsp, %rbx - shrq $32, %rbx - - /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ - pushq $__KERNEL32_CS - leaq 0f(%rip), %rax - pushq %rax - lretq - - /* - * The 32-bit code below will do a far jump back to long mode and end - * up here after reconfiguring the number of paging levels. First, the - * stack pointer needs to be restored to its full 64-bit value before - * the callee save register contents can be popped from the stack. - */ -.Lret: - shlq $32, %rbx - orq %rbx, %rsp - - /* Restore the preserved 64-bit registers */ - popq %rbx - popq %rbp - popq %r12 - popq %r13 - popq %r14 - popq %r15 - retq - .code32 -0: - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Point CR3 to the trampoline's new top level page table */ - movl %edi, %cr3 - - /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - /* Avoid writing EFER if no change was made (for TDX guest) */ - jc 1f - wrmsr -1: - /* Toggle CR4.LA57 */ - movl %cr4, %eax - btcl $X86_CR4_LA57_BIT, %eax - movl %eax, %cr4 - - /* Enable paging again. */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* - * Return to the 64-bit calling code using LJMP rather than LRET, to - * avoid the need for a 32-bit addressable stack. The destination - * address will be adjusted after the template code is copied into a - * 32-bit addressable buffer. - */ -.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) -SYM_CODE_END(trampoline_32bit_src) - -/* - * This symbol is placed right after trampoline_32bit_src() so its address can - * be used to infer the size of the trampoline code. - */ -SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) - - /* - * The trampoline code has a size limit. - * Make sure we fail to compile if the trampoline code grows - * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. 
- */ - .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE - - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: diff --git a/arch/x86/boot/compressed/la57toggle.S b/arch/x86/boot/compressed/la57toggle.S new file mode 100644 index 000000000000..9ee002387eb1 --- /dev/null +++ b/arch/x86/boot/compressed/la57toggle.S @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/boot.h> +#include <asm/msr.h> +#include <asm/processor-flags.h> +#include "pgtable.h" + +/* + * This is the 32-bit trampoline that will be copied over to low memory. It + * will be called using the ordinary 64-bit calling convention from code + * running in 64-bit mode. + * + * Return address is at the top of the stack (might be above 4G). + * The first argument (EDI) contains the address of the temporary PGD level + * page table in 32-bit addressable memory which will be programmed into + * register CR3. + */ + + .section ".rodata", "a", @progbits +SYM_CODE_START(trampoline_32bit_src) + /* + * Preserve callee save 64-bit registers on the stack: this is + * necessary because the architecture does not guarantee that GPRs will + * retain their full 64-bit values across a 32-bit mode switch. + */ + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ + movq %rsp, %rbx + shrq $32, %rbx + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq 0f(%rip), %rax + pushq %rax + lretq + + /* + * The 32-bit code below will do a far jump back to long mode and end + * up here after reconfiguring the number of paging levels. First, the + * stack pointer needs to be restored to its full 64-bit value before + * the callee save register contents can be popped from the stack. + */ +.Lret: + shlq $32, %rbx + orq %rbx, %rsp + + /* Restore the preserved 64-bit registers */ + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + retq + + .code32 +0: + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Point CR3 to the trampoline's new top level page table */ + movl %edi, %cr3 + + /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f + wrmsr +1: + /* Toggle CR4.LA57 */ + movl %cr4, %eax + btcl $X86_CR4_LA57_BIT, %eax + movl %eax, %cr4 + + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* + * Return to the 64-bit calling code using LJMP rather than LRET, to + * avoid the need for a 32-bit addressable stack. The destination + * address will be adjusted after the template code is copied into a + * 32-bit addressable buffer. + */ +.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) +SYM_CODE_END(trampoline_32bit_src) + +/* + * This symbol is placed right after trampoline_32bit_src() so its address can + * be used to infer the size of the trampoline code. + */ +SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. 
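+ * If it ever does grow past that limit, the .org directive below would
+ * have to move the location counter backwards, which the assembler
+ * rejects, so the build fails rather than overflowing the space reserved
+ * for the trampoline copy.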
+ */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0d37420cad02..1cdcd4aaf395 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -235,7 +235,7 @@ static void handle_relocations(void *output, unsigned long output_len, /* * Process relocations: 32 bit relocations first then 64 bit after. - * Three sets of binary relocations are added to the end of the kernel + * Two sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -245,8 +245,6 @@ static void handle_relocations(void *output, unsigned long output_len, * kernel bits... * 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated - * 0 - zero terminator for inverse 32 bit relocations - * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -263,16 +261,6 @@ static void handle_relocations(void *output, unsigned long output_len, *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 - while (*--reloc) { - long extended = *reloc; - extended += map; - - ptr = (unsigned long)extended; - if (ptr < min_addr || ptr > max_addr) - error("inverse 32-bit relocation outside of kernel!\n"); - - *(int32_t *)ptr -= delta; - } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 083ec6d7722a..3b2bc61c9408 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -48,7 +48,7 @@ SECTIONS *(.data) *(.data.*) - /* Add 4 bytes of extra space for a CRC-32 checksum */ + /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */ . = ALIGN(. + 4, 0x200); _edata = . ; } diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 0aae4d4ed615..f82de8de5dc6 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -22,10 +22,11 @@ # include "boot.h" #endif #include <linux/types.h> +#include <asm/cpufeaturemasks.h> #include <asm/intel-family.h> #include <asm/processor-flags.h> -#include <asm/required-features.h> #include <asm/msr-index.h> + #include "string.h" #include "msr.h" diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index d75237ba7ce9..916bac09b464 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -3,7 +3,6 @@ #include "bitops.h" #include <asm/processor-flags.h> -#include <asm/required-features.h> #include <asm/msr-index.h> #include "cpuflags.h" @@ -29,40 +28,32 @@ static int has_fpu(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } +#ifdef CONFIG_X86_32 /* * For building the 16-bit code we want to explicitly specify 32-bit * push/pop operations, rather than just saying 'pushf' or 'popf' and - * letting the compiler choose. But this is also included from the - * compressed/ directory where it may be 64-bit code, and thus needs - * to be 'pushfq' or 'popfq' in that case. + * letting the compiler choose. 
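+ * On 64-bit builds this probe is unnecessary: long mode guarantees
+ * CPUID (and a toggleable EFLAGS.ID), so cpuflags.h supplies a
+ * has_eflag() stub that simply returns true.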
*/ -#ifdef __x86_64__ -#define PUSHF "pushfq" -#define POPF "popfq" -#else -#define PUSHF "pushfl" -#define POPF "popfl" -#endif - -int has_eflag(unsigned long mask) +bool has_eflag(unsigned long mask) { unsigned long f0, f1; - asm volatile(PUSHF " \n\t" - PUSHF " \n\t" + asm volatile("pushfl \n\t" + "pushfl \n\t" "pop %0 \n\t" "mov %0,%1 \n\t" "xor %2,%1 \n\t" "push %1 \n\t" - POPF " \n\t" - PUSHF " \n\t" + "popfl \n\t" + "pushfl \n\t" "pop %1 \n\t" - POPF + "popfl" : "=&r" (f0), "=&r" (f1) : "ri" (mask)); return !!((f0^f1) & mask); } +#endif void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 475b8fde90f7..a398d9204ad0 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -15,8 +15,13 @@ struct cpu_features { extern struct cpu_features cpu; extern u32 cpu_vendor[3]; -int has_eflag(unsigned long mask); +#ifdef CONFIG_X86_32 +bool has_eflag(unsigned long mask); +#else +static inline bool has_eflag(unsigned long mask) { return true; } +#endif void get_cpuflags(void); void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); +bool has_cpuflag(int flag); #endif diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index c9299aeb7333..3882ead513f7 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -22,6 +22,7 @@ # This script requires: # bash # syslinux +# genisoimage # mtools (for fdimage* and hdimage) # edk2/OVMF (for hdimage) # @@ -251,7 +252,9 @@ geniso() { cp "$isolinux" "$ldlinux" "$tmp_dir" cp "$FBZIMAGE" "$tmp_dir"/linux echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg - cp "${FDINITRDS[@]}" "$tmp_dir"/ + if [ ${#FDINITRDS[@]} -gt 0 ]; then + cp "${FDINITRDS[@]}" "$tmp_dir"/ + fi genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ -quiet -o "$FIMAGE" -b isolinux.bin \ -c boot.cat -no-emul-boot -boot-load-size 4 \ diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index da0ccc5de538..22d730b227e3 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -12,8 +12,6 @@ #include <stdio.h> -#include "../include/asm/required-features.h" -#include "../include/asm/disabled-features.h" #include "../include/asm/cpufeatures.h" #include "../include/asm/vmxfeatures.h" #include "../kernel/cpu/capflags.c" @@ -23,6 +21,7 @@ int main(void) int i, j; const char *str; + printf("#include <asm/cpufeaturemasks.h>\n\n"); printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 3a2d1360abb0..e1d594a60204 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -45,6 +45,8 @@ SECTIONS setup_size = ALIGN(ABSOLUTE(.), 4096); setup_sects = ABSOLUTE(setup_size / 512); + ASSERT(setup_sects >= 5, "The setup must be at least 5 sectors in size"); + ASSERT(setup_sects <= 64, "The setup must be at most 64 sectors in size"); } . = ALIGN(16); diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c deleted file mode 100644 index 10311d77c67f..000000000000 --- a/arch/x86/boot/tools/build.c +++ /dev/null @@ -1,247 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1997 Martin Mares - * Copyright (C) 2007 H. 
Peter Anvin - */ - -/* - * This file builds a disk-image from three different files: - * - * - setup: 8086 machine code, sets up system parm - * - system: 80386 code for actual system - * - zoffset.h: header with ZO_* defines - * - * It does some checking that all files are of the correct type, and writes - * the result to the specified destination, removing headers and padding to - * the right amount. It also writes some system data to stdout. - */ - -/* - * Changes by tytso to allow root device specification - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - * Cross compiling fixes by Gertjan van Wingerde, July 1996 - * Rewritten by Martin Mares, April 1997 - * Substantially overhauled by H. Peter Anvin, April 2007 - */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <stdarg.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <tools/le_byteshift.h> - -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; - -/* Minimal number of setup sectors */ -#define SETUP_SECT_MIN 5 -#define SETUP_SECT_MAX 64 - -/* This must be large enough to hold the entire setup */ -u8 buf[SETUP_SECT_MAX*512]; - -static unsigned long _edata; - -/*----------------------------------------------------------------------*/ - -static const u32 crctab32[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, - 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, - 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, - 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, - 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, - 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, - 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, - 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, - 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, - 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, - 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, - 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, - 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, - 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, - 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, - 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, - 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, - 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, - 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, - 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, - 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, - 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, - 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, - 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 
0x316e8eef, - 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, - 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, - 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, - 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, - 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, - 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, - 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, - 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, - 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, - 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, - 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, - 0x2d02ef8d -}; - -static u32 partial_crc32_one(u8 c, u32 crc) -{ - return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8); -} - -static u32 partial_crc32(const u8 *s, int len, u32 crc) -{ - while (len--) - crc = partial_crc32_one(*s++, crc); - return crc; -} - -static void die(const char * str, ...) -{ - va_list args; - va_start(args, str); - vfprintf(stderr, str, args); - va_end(args); - fputc('\n', stderr); - exit(1); -} - -static void usage(void) -{ - die("Usage: build setup system zoffset.h image"); -} - -/* - * Parse zoffset.h and find the entry points. We could just #include zoffset.h - * but that would mean tools/build would have to be rebuilt every time. It's - * not as if parsing it is hard... - */ -#define PARSE_ZOFS(p, sym) do { \ - if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \ - sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \ -} while (0) - -static void parse_zoffset(char *fname) -{ - FILE *file; - char *p; - int c; - - file = fopen(fname, "r"); - if (!file) - die("Unable to open `%s': %m", fname); - c = fread(buf, 1, sizeof(buf) - 1, file); - if (ferror(file)) - die("read-error on `zoffset.h'"); - fclose(file); - buf[c] = 0; - - p = (char *)buf; - - while (p && *p) { - PARSE_ZOFS(p, _edata); - - p = strchr(p, '\n'); - while (p && (*p == '\r' || *p == '\n')) - p++; - } -} - -int main(int argc, char ** argv) -{ - unsigned int i, sz, setup_sectors; - int c; - struct stat sb; - FILE *file, *dest; - int fd; - void *kernel; - u32 crc = 0xffffffffUL; - - if (argc != 5) - usage(); - parse_zoffset(argv[3]); - - dest = fopen(argv[4], "w"); - if (!dest) - die("Unable to write `%s': %m", argv[4]); - - /* Copy the setup code */ - file = fopen(argv[1], "r"); - if (!file) - die("Unable to open `%s': %m", argv[1]); - c = fread(buf, 1, sizeof(buf), file); - if (ferror(file)) - die("read-error on `setup'"); - if (c < 1024) - die("The setup must be at least 1024 bytes"); - if (get_unaligned_le16(&buf[510]) != 0xAA55) - die("Boot block hasn't got boot flag (0xAA55)"); - fclose(file); - - /* Pad unused space with zeros */ - setup_sectors = (c + 4095) / 4096; - setup_sectors *= 8; - if (setup_sectors < SETUP_SECT_MIN) - setup_sectors = SETUP_SECT_MIN; - i = setup_sectors*512; - memset(buf+c, 0, i-c); - - /* Open and stat the kernel file */ - fd = open(argv[2], O_RDONLY); - if (fd < 0) - die("Unable to open `%s': %m", argv[2]); - if (fstat(fd, &sb)) - die("Unable to stat `%s': %m", argv[2]); - if (_edata != sb.st_size) - die("Unexpected file size `%s': %u != %u", argv[2], _edata, - sb.st_size); - sz = 
_edata - 4; - kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); - if (kernel == MAP_FAILED) - die("Unable to mmap '%s': %m", argv[2]); - - crc = partial_crc32(buf, i, crc); - if (fwrite(buf, 1, i, dest) != i) - die("Writing setup failed"); - - /* Copy the kernel code */ - crc = partial_crc32(kernel, sz, crc); - if (fwrite(kernel, 1, sz, dest) != sz) - die("Writing kernel failed"); - - /* Write the CRC */ - put_unaligned_le32(crc, buf); - if (fwrite(buf, 1, 4, dest) != 4) - die("Writing CRC failed"); - - /* Catch any delayed write failures */ - if (fclose(dest)) - die("Writing image failed"); - - close(fd); - - /* Everything is OK */ - return 0; -} diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 96c7bc698e6b..b0c1a7a57497 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -9,8 +9,6 @@ #define pr_fmt(fmt) "SEV: " fmt -#define DISABLE_BRANCH_PROFILING - #include <linux/sched/debug.h> /* For show_regs() */ #include <linux/percpu-defs.h> #include <linux/cc_platform.h> @@ -1482,8 +1480,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) case MSR_AMD64_GUEST_TSC_FREQ: if (sev_status & MSR_AMD64_SNP_SECURE_TSC) return __vc_handle_secure_tsc_msrs(regs, write); - else - break; + break; default: break; } diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 32809a06dab4..7772b01ab738 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -167,11 +167,11 @@ static void __noreturn tdx_panic(const char *msg) /* Define register order according to the GHCI */ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; }; - char str[64]; + char bytes[64] __nonstring; } message; /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */ - strtomem_pad(message.str, msg, '\0'); + strtomem_pad(message.bytes, msg, '\0'); args.r8 = message.r8; args.r9 = message.r9; diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index 581296255b39..d5d091e03bd3 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -1,6 +1,4 @@ # global x86 required specific stuff -# On 32-bit HIGHMEM4G is not allowed -CONFIG_HIGHMEM64G=y CONFIG_64BIT=y # These enable us to allow some of the diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index eb153eff9331..b37881bb9f15 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -17,6 +17,7 @@ */ #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/frame.h> #define STATE1 %xmm0 @@ -1071,6 +1072,7 @@ SYM_FUNC_END(_aesni_inc) * size_t len, u8 *iv) */ SYM_FUNC_START(aesni_ctr_enc) + ANNOTATE_NOENDBR FRAME_BEGIN cmp $16, LEN jb .Lctr_enc_just_ret diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index 646477a13e11..1dfef28c1266 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -882,7 +883,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) jmp .Ldec_max24; SYM_FUNC_END(__camellia_dec_blk16) -SYM_FUNC_START(camellia_ecb_enc_16way) +SYM_TYPED_FUNC_START(camellia_ecb_enc_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -907,7 +908,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) RET; SYM_FUNC_END(camellia_ecb_enc_16way) -SYM_FUNC_START(camellia_ecb_dec_16way) +SYM_TYPED_FUNC_START(camellia_ecb_dec_16way) /* input: * 
%rdi: ctx, CTX * %rsi: dst (16 blocks) @@ -937,7 +938,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) RET; SYM_FUNC_END(camellia_ecb_dec_16way) -SYM_FUNC_START(camellia_cbc_dec_16way) +SYM_TYPED_FUNC_START(camellia_cbc_dec_16way) /* input: * %rdi: ctx, CTX * %rsi: dst (16 blocks) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index a0eb94e53b1b..b1c9b9450555 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #define CAMELLIA_TABLE_BYTE_LEN 272 diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 816b6bb8bded..824cb94de6c2 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "camellia-x86_64-asm_64.S" .text @@ -177,7 +178,7 @@ bswapq RAB0; \ movq RAB0, 4*2(RIO); -SYM_FUNC_START(__camellia_enc_blk) +SYM_TYPED_FUNC_START(__camellia_enc_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -224,7 +225,7 @@ SYM_FUNC_START(__camellia_enc_blk) RET; SYM_FUNC_END(__camellia_enc_blk) -SYM_FUNC_START(camellia_dec_blk) +SYM_TYPED_FUNC_START(camellia_dec_blk) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -411,7 +412,7 @@ SYM_FUNC_END(camellia_dec_blk) bswapq RAB1; \ movq RAB1, 12*2(RIO); -SYM_FUNC_START(__camellia_enc_blk_2way) +SYM_TYPED_FUNC_START(__camellia_enc_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -460,7 +461,7 @@ SYM_FUNC_START(__camellia_enc_blk_2way) RET; SYM_FUNC_END(__camellia_enc_blk_2way) -SYM_FUNC_START(camellia_dec_blk_2way) +SYM_TYPED_FUNC_START(camellia_dec_blk_2way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 97e283621851..84e47f7f6188 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -9,6 +9,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/frame.h> #include "glue_helper-asm-avx.S" @@ -656,7 +657,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) RET; SYM_FUNC_END(__serpent_dec_blk8_avx) -SYM_FUNC_START(serpent_ecb_enc_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_enc_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -674,7 +675,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) -SYM_FUNC_START(serpent_ecb_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_ecb_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -692,7 +693,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) -SYM_FUNC_START(serpent_cbc_dec_8way_avx) +SYM_TYPED_FUNC_START(serpent_cbc_dec_8way_avx) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index d2288bf38a8a..071e90e7f0d8 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -6,6 +6,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> .file "twofish-x86_64-asm-3way.S" .text @@ -220,7 +221,7 @@ rorq $32, RAB2; \ outunpack3(mov, RIO, 2, RAB, 2); -SYM_FUNC_START(__twofish_enc_blk_3way) +SYM_TYPED_FUNC_START(__twofish_enc_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst @@ -269,7 +270,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) RET; 
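/*
 * SYM_TYPED_FUNC_START comes from <linux/cfi_types.h>: with CONFIG_CFI_CLANG
 * it emits a kCFI type hash in front of the symbol, so indirect calls from C
 * (which check that hash at the call site) can dispatch into these asm
 * routines without faulting.  A minimal sketch, using a hypothetical
 * function name:
 *
 *     #include <linux/linkage.h>
 *     #include <linux/cfi_types.h>
 *
 *     SYM_TYPED_FUNC_START(example_cipher_fn)
 *             RET
 *     SYM_FUNC_END(example_cipher_fn)
 *
 * Functions that are only ever called directly keep plain SYM_FUNC_START.
 */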
SYM_FUNC_END(__twofish_enc_blk_3way) -SYM_FUNC_START(twofish_dec_blk_3way) +SYM_TYPED_FUNC_START(twofish_dec_blk_3way) /* input: * %rdi: ctx, CTX * %rsi: dst diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 775af290cd19..e08b4ba07b93 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -8,6 +8,7 @@ .text #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/asm-offsets.h> #define a_offset 0 @@ -202,7 +203,7 @@ xor %r8d, d ## D;\ ror $1, d ## D; -SYM_FUNC_START(twofish_enc_blk) +SYM_TYPED_FUNC_START(twofish_enc_blk) pushq R1 /* %rdi contains the ctx address */ @@ -255,7 +256,7 @@ SYM_FUNC_START(twofish_enc_blk) RET SYM_FUNC_END(twofish_enc_blk) -SYM_FUNC_START(twofish_dec_blk) +SYM_TYPED_FUNC_START(twofish_dec_blk) pushq R1 /* %rdi contains the ctx address */ diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index ce1cc1622385..72cae8e0ce85 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -7,12 +7,13 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -CFLAGS_common.o += -fno-stack-protector +CFLAGS_syscall_32.o += -fno-stack-protector +CFLAGS_syscall_64.o += -fno-stack-protector obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o -obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ @@ -23,4 +24,3 @@ CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE) obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o -obj-$(CONFIG_X86_X32_ABI) += syscall_x32.o diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index ea81770629ee..cb0911c5dc5d 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -431,6 +431,7 @@ For 32-bit we have the following conventions - kernel is built with /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ .macro THUNK name, func SYM_FUNC_START(\name) + ANNOTATE_NOENDBR pushq %rbp movq %rsp, %rbp diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c deleted file mode 100644 index 14db5b85114c..000000000000 --- a/arch/x86/entry/common.c +++ /dev/null @@ -1,524 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * common.c - C code for kernel entry and exit - * Copyright (c) 2015 Andrew Lutomirski - * - * Based on asm and ptrace code by many authors. The code here originated - * in ptrace.c and signal.c. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/entry-common.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/export.h> -#include <linux/nospec.h> -#include <linux/syscalls.h> -#include <linux/uaccess.h> -#include <linux/init.h> - -#ifdef CONFIG_XEN_PV -#include <xen/xen-ops.h> -#include <xen/events.h> -#endif - -#include <asm/apic.h> -#include <asm/desc.h> -#include <asm/traps.h> -#include <asm/vdso.h> -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/nospec-branch.h> -#include <asm/io_bitmap.h> -#include <asm/syscall.h> -#include <asm/irq_stack.h> - -#ifdef CONFIG_X86_64 - -static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) -{ - /* - * Convert negative numbers to very high and thus out of range - * numbers for comparisons. 
- */ - unsigned int unr = nr; - - if (likely(unr < NR_syscalls)) { - unr = array_index_nospec(unr, NR_syscalls); - regs->ax = x64_sys_call(regs, unr); - return true; - } - return false; -} - -static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) -{ - /* - * Adjust the starting offset of the table, and convert numbers - * < __X32_SYSCALL_BIT to very high and thus out of range - * numbers for comparisons. - */ - unsigned int xnr = nr - __X32_SYSCALL_BIT; - - if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { - xnr = array_index_nospec(xnr, X32_NR_syscalls); - regs->ax = x32_sys_call(regs, xnr); - return true; - } - return false; -} - -/* Returns true to return using SYSRET, or false to use IRET */ -__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) -{ - add_random_kstack_offset(); - nr = syscall_enter_from_user_mode(regs, nr); - - instrumentation_begin(); - - if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { - /* Invalid system call, but still a system call. */ - regs->ax = __x64_sys_ni_syscall(regs); - } - - instrumentation_end(); - syscall_exit_to_user_mode(regs); - - /* - * Check that the register state is valid for using SYSRET to exit - * to userspace. Otherwise use the slower but fully capable IRET - * exit path. - */ - - /* XEN PV guests always use the IRET path */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) - return false; - - /* SYSRET requires RCX == RIP and R11 == EFLAGS */ - if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) - return false; - - /* CS and SS must match the values set in MSR_STAR */ - if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) - return false; - - /* - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. - * - * TASK_SIZE_MAX covers all user-accessible addresses other than - * the deprecated vsyscall page. - */ - if (unlikely(regs->ip >= TASK_SIZE_MAX)) - return false; - - /* - * SYSRET cannot restore RF. It can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. - */ - if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) - return false; - - /* Use SYSRET to exit to userspace */ - return true; -} -#endif - -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static __always_inline int syscall_32_enter(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_IA32_EMULATION)) - current_thread_info()->status |= TS_COMPAT; - - return (int)regs->orig_ax; -} - -#ifdef CONFIG_IA32_EMULATION -bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); - -static int ia32_emulation_override_cmdline(char *arg) -{ - return kstrtobool(arg, &__ia32_enabled); -} -early_param("ia32_emulation", ia32_emulation_override_cmdline); -#endif - -/* - * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. - */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) -{ - /* - * Convert negative numbers to very high and thus out of range - * numbers for comparisons. 
- */ - unsigned int unr = nr; - - if (likely(unr < IA32_NR_syscalls)) { - unr = array_index_nospec(unr, IA32_NR_syscalls); - regs->ax = ia32_sys_call(regs, unr); - } else if (nr != -1) { - regs->ax = __ia32_sys_ni_syscall(regs); - } -} - -#ifdef CONFIG_IA32_EMULATION -static __always_inline bool int80_is_external(void) -{ - const unsigned int offs = (0x80 / 32) * 0x10; - const u32 bit = BIT(0x80 % 32); - - /* The local APIC on XENPV guests is fake */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) - return false; - - /* - * If vector 0x80 is set in the APIC ISR then this is an external - * interrupt. Either from broken hardware or injected by a VMM. - * - * Note: In guest mode this is only valid for secure guests where - * the secure module fully controls the vAPIC exposed to the guest. - */ - return apic_read(APIC_ISR + offs) & bit; -} - -/** - * do_int80_emulation - 32-bit legacy syscall C entry from asm - * @regs: syscall arguments in struct pt_args on the stack. - * - * This entry point can be used by 32-bit and 64-bit programs to perform - * 32-bit system calls. Instances of INT $0x80 can be found inline in - * various programs and libraries. It is also used by the vDSO's - * __kernel_vsyscall fallback for hardware that doesn't support a faster - * entry method. Restarted 32-bit system calls also fall back to INT - * $0x80 regardless of what instruction was originally used to do the - * system call. - * - * This is considered a slow path. It is not used by most libc - * implementations on modern hardware except during process startup. - * - * The arguments for the INT $0x80 based syscall are on stack in the - * pt_regs structure: - * eax: system call number - * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 - */ -__visible noinstr void do_int80_emulation(struct pt_regs *regs) -{ - int nr; - - /* Kernel does not use INT $0x80! */ - if (unlikely(!user_mode(regs))) { - irqentry_enter(regs); - instrumentation_begin(); - panic("Unexpected external interrupt 0x80\n"); - } - - /* - * Establish kernel context for instrumentation, including for - * int80_is_external() below which calls into the APIC driver. - * Identical for soft and external interrupts. - */ - enter_from_user_mode(regs); - - instrumentation_begin(); - add_random_kstack_offset(); - - /* Validate that this is a soft interrupt to the extent possible */ - if (unlikely(int80_is_external())) - panic("Unexpected external interrupt 0x80\n"); - - /* - * The low level idtentry code pushed -1 into regs::orig_ax - * and regs::ax contains the syscall number. - * - * User tracing code (ptrace or signal handlers) might assume - * that the regs::orig_ax contains a 32-bit number on invoking - * a 32-bit syscall. - * - * Establish the syscall convention by saving the 32bit truncated - * syscall number in regs::orig_ax and by invalidating regs::ax. - */ - regs->orig_ax = regs->ax & GENMASK(31, 0); - regs->ax = -ENOSYS; - - nr = syscall_32_enter(regs); - - local_irq_enable(); - nr = syscall_enter_from_user_mode_work(regs, nr); - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); -} - -#ifdef CONFIG_X86_FRED -/* - * A FRED-specific INT80 handler is warranted for the follwing reasons: - * - * 1) As INT instructions and hardware interrupts are separate event - * types, FRED does not preclude the use of vector 0x80 for external - * interrupts. 
As a result, the FRED setup code does not reserve - * vector 0x80 and calling int80_is_external() is not merely - * suboptimal but actively incorrect: it could cause a system call - * to be incorrectly ignored. - * - * 2) It is called only for handling vector 0x80 of event type - * EVENT_TYPE_SWINT and will never be called to handle any external - * interrupt (event type EVENT_TYPE_EXTINT). - * - * 3) FRED has separate entry flows depending on if the event came from - * user space or kernel space, and because the kernel does not use - * INT insns, the FRED kernel entry handler fred_entry_from_kernel() - * falls through to fred_bad_type() if the event type is - * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling - * an INT insn, it can only be from a user level. - * - * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will - * likely take a different approach if it is ever needed: it - * probably belongs in either fred_intx()/ fred_other() or - * asm_fred_entrypoint_user(), depending on if this ought to be done - * for all entries from userspace or only system - * calls. - * - * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. - */ -DEFINE_FREDENTRY_RAW(int80_emulation) -{ - int nr; - - enter_from_user_mode(regs); - - instrumentation_begin(); - add_random_kstack_offset(); - - /* - * FRED pushed 0 into regs::orig_ax and regs::ax contains the - * syscall number. - * - * User tracing code (ptrace or signal handlers) might assume - * that the regs::orig_ax contains a 32-bit number on invoking - * a 32-bit syscall. - * - * Establish the syscall convention by saving the 32bit truncated - * syscall number in regs::orig_ax and by invalidating regs::ax. - */ - regs->orig_ax = regs->ax & GENMASK(31, 0); - regs->ax = -ENOSYS; - - nr = syscall_32_enter(regs); - - local_irq_enable(); - nr = syscall_enter_from_user_mode_work(regs, nr); - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); -} -#endif -#else /* CONFIG_IA32_EMULATION */ - -/* Handles int $0x80 on a 32bit kernel */ -__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) -{ - int nr = syscall_32_enter(regs); - - add_random_kstack_offset(); - /* - * Subtlety here: if ptrace pokes something larger than 2^31-1 into - * orig_ax, the int return value truncates it. This matches - * the semantics of syscall_get_nr(). - */ - nr = syscall_enter_from_user_mode(regs, nr); - instrumentation_begin(); - - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); -} -#endif /* !CONFIG_IA32_EMULATION */ - -static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) -{ - int nr = syscall_32_enter(regs); - int res; - - add_random_kstack_offset(); - /* - * This cannot use syscall_enter_from_user_mode() as it has to - * fetch EBP before invoking any of the syscall entry work - * functions. - */ - syscall_enter_from_user_mode_prepare(regs); - - instrumentation_begin(); - /* Fetch EBP from where the vDSO stashed it. */ - if (IS_ENABLED(CONFIG_X86_64)) { - /* - * Micro-optimization: the pointer we're following is - * explicitly 32 bits, so it can't be out of range. - */ - res = __get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp); - } else { - res = get_user(*(u32 *)®s->bp, - (u32 __user __force *)(unsigned long)(u32)regs->sp); - } - - if (res) { - /* User code screwed up. 
*/ - regs->ax = -EFAULT; - - local_irq_disable(); - instrumentation_end(); - irqentry_exit_to_user_mode(regs); - return false; - } - - nr = syscall_enter_from_user_mode_work(regs, nr); - - /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs, nr); - - instrumentation_end(); - syscall_exit_to_user_mode(regs); - return true; -} - -/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ -__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) -{ - /* - * Called using the internal vDSO SYSENTER/SYSCALL32 calling - * convention. Adjust regs so it looks like we entered using int80. - */ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + - vdso_image_32.sym_int80_landing_pad; - - /* - * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward - * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. - * Fix it up. - */ - regs->ip = landing_pad; - - /* Invoke the syscall. If it failed, keep it simple: use IRET. */ - if (!__do_fast_syscall_32(regs)) - return false; - - /* - * Check that the register state is valid for using SYSRETL/SYSEXIT - * to exit to userspace. Otherwise use the slower but fully capable - * IRET exit path. - */ - - /* XEN PV guests always use the IRET path */ - if (cpu_feature_enabled(X86_FEATURE_XENPV)) - return false; - - /* EIP must point to the VDSO landing pad */ - if (unlikely(regs->ip != landing_pad)) - return false; - - /* CS and SS must match the values set in MSR_STAR */ - if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) - return false; - - /* If the TF, RF, or VM flags are set, use IRET */ - if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) - return false; - - /* Use SYSRETL/SYSEXIT to exit to userspace */ - return true; -} - -/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ -__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) -{ - /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ - regs->sp = regs->bp; - - /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ - regs->flags |= X86_EFLAGS_IF; - - return do_fast_syscall_32(regs); -} -#endif - -SYSCALL_DEFINE0(ni_syscall) -{ - return -ENOSYS; -} - -#ifdef CONFIG_XEN_PV -#ifndef CONFIG_PREEMPTION -/* - * Some hypercalls issued by the toolstack can take many 10s of - * seconds. Allow tasks running hypercalls via the privcmd driver to - * be voluntarily preempted even if full kernel preemption is - * disabled. - * - * Such preemptible hypercalls are bracketed by - * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() - * calls. - */ -DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); -EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); - -/* - * In case of scheduling the flag must be cleared and restored after - * returning from schedule as the task might move to a different CPU. 
- */ -static __always_inline bool get_and_clear_inhcall(void) -{ - bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); - - __this_cpu_write(xen_in_preemptible_hcall, false); - return inhcall; -} - -static __always_inline void restore_inhcall(bool inhcall) -{ - __this_cpu_write(xen_in_preemptible_hcall, inhcall); -} -#else -static __always_inline bool get_and_clear_inhcall(void) { return false; } -static __always_inline void restore_inhcall(bool inhcall) { } -#endif - -static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - inc_irq_stat(irq_hv_callback_count); - - xen_evtchn_do_upcall(); - - set_irq_regs(old_regs); -} - -__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) -{ - irqentry_state_t state = irqentry_enter(regs); - bool inhcall; - - instrumentation_begin(); - run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); - - inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { - irqentry_exit_cond_resched(); - instrumentation_end(); - restore_inhcall(inhcall); - } else { - instrumentation_end(); - irqentry_exit(regs, state); - } -} -#endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index b7ea3e8e9ecc..d3caa31240ed 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -5,6 +5,7 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> #include <asm/segment.h> @@ -17,6 +18,7 @@ .pushsection .noinstr.text, "ax" SYM_FUNC_START(entry_ibpb) + ANNOTATE_NOENDBR movl $MSR_IA32_PRED_CMD, %ecx movl $PRED_CMD_IBPB, %eax xorl %edx, %edx @@ -52,7 +54,6 @@ EXPORT_SYMBOL_GPL(mds_verw_sel); THUNK warn_thunk_thunk, __warn_thunk -#ifndef CONFIG_X86_64 /* * Clang's implementation of TLS stack cookies requires the variable in * question to be a TLS variable. If the variable happens to be defined as an @@ -63,7 +64,6 @@ THUNK warn_thunk_thunk, __warn_thunk * entirely in the C code, and use an alias emitted by the linker script * instead. */ -#ifdef CONFIG_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP) EXPORT_SYMBOL(__ref_stack_chk_guard); #endif -#endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 20be5758c2d2..92c0b4a94e0a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1153,7 +1153,7 @@ SYM_CODE_START(asm_exc_nmi) * is using the thread stack right now, so it's safe for us to use it. */ movl %esp, %ebx - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call exc_nmi movl %ebx, %esp @@ -1217,7 +1217,7 @@ SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp call make_task_dead diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f52dbe0ad93c..f40bdf97d390 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64) /* tss.sp2 is scratch space. 
*/ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -175,6 +175,7 @@ SYM_CODE_END(entry_SYSCALL_64) */ .pushsection .text, "ax" SYM_FUNC_START(__switch_to_asm) + ANNOTATE_NOENDBR /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -192,7 +193,7 @@ SYM_FUNC_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary) + movq %rbx, PER_CPU_VAR(__stack_chk_guard) #endif /* @@ -742,6 +743,7 @@ _ASM_NOKPROBE(common_interrupt_return) * Is in entry.text as it shouldn't be instrumented. */ SYM_FUNC_START(asm_load_gs_index) + ANNOTATE_NOENDBR FRAME_BEGIN swapgs .Lgs_change: @@ -1166,7 +1168,7 @@ SYM_CODE_START(asm_exc_nmi) FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ @@ -1484,7 +1486,7 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS @@ -1526,6 +1528,7 @@ SYM_CODE_END(rewind_stack_and_make_dead) * refactored in the future if needed. */ SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR push %rbp mov %rsp, %rbp movl $5, %ecx diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ed0a5f2dc129..a45e1125fc6c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -57,7 +57,7 @@ SYM_CODE_START(entry_SYSENTER_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rax popq %rax - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ @@ -193,7 +193,7 @@ SYM_CODE_START(entry_SYSCALL_compat) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* Switch to the kernel stack */ - movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index a02bc6f3d2e6..29c5c32c16c3 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -58,6 +58,7 @@ SYM_CODE_END(asm_fred_entrypoint_kernel) #if IS_ENABLED(CONFIG_KVM_INTEL) SYM_FUNC_START(asm_fred_entry_from_kvm) + ANNOTATE_NOENDBR push %rbp mov %rsp, %rbp diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8cc9950d7104..2b15ea17bb7c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -1,10 +1,16 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for i386. 
*/ +// SPDX-License-Identifier: GPL-2.0-only +/* 32-bit system call dispatch */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> +#include <linux/entry-common.h> +#include <linux/nospec.h> +#include <linux/uaccess.h> +#include <asm/apic.h> +#include <asm/traps.h> +#include <asm/cpufeature.h> #include <asm/syscall.h> #ifdef CONFIG_IA32_EMULATION @@ -41,4 +47,324 @@ long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) #include <asm/syscalls_32.h> default: return __ia32_sys_ni_syscall(regs); } -}; +} + +static __always_inline int syscall_32_enter(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + current_thread_info()->status |= TS_COMPAT; + + return (int)regs->orig_ax; +} + +#ifdef CONFIG_IA32_EMULATION +bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); + +static int __init ia32_emulation_override_cmdline(char *arg) +{ + return kstrtobool(arg, &__ia32_enabled); +} +early_param("ia32_emulation", ia32_emulation_override_cmdline); +#endif + +/* + * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. + */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < IA32_NR_syscalls)) { + unr = array_index_nospec(unr, IA32_NR_syscalls); + regs->ax = ia32_sys_call(regs, unr); + } else if (nr != -1) { + regs->ax = __ia32_sys_ni_syscall(regs); + } +} + +#ifdef CONFIG_IA32_EMULATION +static __always_inline bool int80_is_external(void) +{ + const unsigned int offs = (0x80 / 32) * 0x10; + const u32 bit = BIT(0x80 % 32); + + /* The local APIC on XENPV guests is fake */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* + * If vector 0x80 is set in the APIC ISR then this is an external + * interrupt. Either from broken hardware or injected by a VMM. + * + * Note: In guest mode this is only valid for secure guests where + * the secure module fully controls the vAPIC exposed to the guest. + */ + return apic_read(APIC_ISR + offs) & bit; +} + +/** + * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_args on the stack. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * The arguments for the INT $0x80 based syscall are on stack in the + * pt_regs structure: + * eax: system call number + * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 + */ +__visible noinstr void do_int80_emulation(struct pt_regs *regs) +{ + int nr; + + /* Kernel does not use INT $0x80! */ + if (unlikely(!user_mode(regs))) { + irqentry_enter(regs); + instrumentation_begin(); + panic("Unexpected external interrupt 0x80\n"); + } + + /* + * Establish kernel context for instrumentation, including for + * int80_is_external() below which calls into the APIC driver. + * Identical for soft and external interrupts. 
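+ * That ordering matters: enter_from_user_mode() runs first, then
+ * instrumentation_begin(), and only then is the APIC consulted via
+ * int80_is_external().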
+ */ + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* Validate that this is a soft interrupt to the extent possible */ + if (unlikely(int80_is_external())) + panic("Unexpected external interrupt 0x80\n"); + + /* + * The low level idtentry code pushed -1 into regs::orig_ax + * and regs::ax contains the syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. + */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_FRED +/* + * A FRED-specific INT80 handler is warranted for the follwing reasons: + * + * 1) As INT instructions and hardware interrupts are separate event + * types, FRED does not preclude the use of vector 0x80 for external + * interrupts. As a result, the FRED setup code does not reserve + * vector 0x80 and calling int80_is_external() is not merely + * suboptimal but actively incorrect: it could cause a system call + * to be incorrectly ignored. + * + * 2) It is called only for handling vector 0x80 of event type + * EVENT_TYPE_SWINT and will never be called to handle any external + * interrupt (event type EVENT_TYPE_EXTINT). + * + * 3) FRED has separate entry flows depending on if the event came from + * user space or kernel space, and because the kernel does not use + * INT insns, the FRED kernel entry handler fred_entry_from_kernel() + * falls through to fred_bad_type() if the event type is + * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling + * an INT insn, it can only be from a user level. + * + * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will + * likely take a different approach if it is ever needed: it + * probably belongs in either fred_intx()/ fred_other() or + * asm_fred_entrypoint_user(), depending on if this ought to be done + * for all entries from userspace or only system + * calls. + * + * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. + */ +DEFINE_FREDENTRY_RAW(int80_emulation) +{ + int nr; + + enter_from_user_mode(regs); + + instrumentation_begin(); + add_random_kstack_offset(); + + /* + * FRED pushed 0 into regs::orig_ax and regs::ax contains the + * syscall number. + * + * User tracing code (ptrace or signal handlers) might assume + * that the regs::orig_ax contains a 32-bit number on invoking + * a 32-bit syscall. + * + * Establish the syscall convention by saving the 32bit truncated + * syscall number in regs::orig_ax and by invalidating regs::ax. 
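+ * (GENMASK(31, 0) keeps only the low 32 bits of RAX; the -ENOSYS placed
+ * in regs::ax is what user space sees if nothing ends up handling the
+ * call.)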
+ */ + regs->orig_ax = regs->ax & GENMASK(31, 0); + regs->ax = -ENOSYS; + + nr = syscall_32_enter(regs); + + local_irq_enable(); + nr = syscall_enter_from_user_mode_work(regs, nr); + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif /* CONFIG_X86_FRED */ + +#else /* CONFIG_IA32_EMULATION */ + +/* Handles int $0x80 on a 32bit kernel */ +__visible noinstr void do_int80_syscall_32(struct pt_regs *regs) +{ + int nr = syscall_32_enter(regs); + + add_random_kstack_offset(); + /* + * Subtlety here: if ptrace pokes something larger than 2^31-1 into + * orig_ax, the int return value truncates it. This matches + * the semantics of syscall_get_nr(). + */ + nr = syscall_enter_from_user_mode(regs, nr); + instrumentation_begin(); + + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); +} +#endif /* !CONFIG_IA32_EMULATION */ + +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) +{ + int nr = syscall_32_enter(regs); + int res; + + add_random_kstack_offset(); + /* + * This cannot use syscall_enter_from_user_mode() as it has to + * fetch EBP before invoking any of the syscall entry work + * functions. + */ + syscall_enter_from_user_mode_prepare(regs); + + instrumentation_begin(); + /* Fetch EBP from where the vDSO stashed it. */ + if (IS_ENABLED(CONFIG_X86_64)) { + /* + * Micro-optimization: the pointer we're following is + * explicitly 32 bits, so it can't be out of range. + */ + res = __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } else { + res = get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp); + } + + if (res) { + /* User code screwed up. */ + regs->ax = -EFAULT; + + local_irq_disable(); + instrumentation_end(); + irqentry_exit_to_user_mode(regs); + return false; + } + + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); + syscall_exit_to_user_mode(regs); + return true; +} + +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* Invoke the syscall. If it failed, keep it simple: use IRET. */ + if (!__do_fast_syscall_32(regs)) + return false; + + /* + * Check that the register state is valid for using SYSRETL/SYSEXIT + * to exit to userspace. Otherwise use the slower but fully capable + * IRET exit path. 
+ */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* EIP must point to the VDSO landing pad */ + if (unlikely(regs->ip != landing_pad)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) + return false; + + /* If the TF, RF, or VM flags are set, use IRET */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) + return false; + + /* Use SYSRETL/SYSEXIT to exit to userspace */ + return true; +} + +/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ +__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) +{ + /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ + regs->sp = regs->bp; + + /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ + regs->flags |= X86_EFLAGS_IF; + + return do_fast_syscall_32(regs); +} diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index ba8354424860..b6e68ea98b83 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -1,15 +1,20 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for x86-64. */ +// SPDX-License-Identifier: GPL-2.0-only +/* 64-bit system call dispatch */ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> #include <linux/syscalls.h> +#include <linux/entry-common.h> +#include <linux/nospec.h> #include <asm/syscall.h> #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); #include <asm/syscalls_64.h> +#ifdef CONFIG_X86_X32_ABI +#include <asm/syscalls_x32.h> +#endif #undef __SYSCALL #undef __SYSCALL_NORETURN @@ -33,4 +38,104 @@ long x64_sys_call(const struct pt_regs *regs, unsigned int nr) #include <asm/syscalls_64.h> default: return __x64_sys_ni_syscall(regs); } -}; +} + +#ifdef CONFIG_X86_X32_ABI +long x32_sys_call(const struct pt_regs *regs, unsigned int nr) +{ + switch (nr) { + #include <asm/syscalls_x32.h> + default: return __x64_sys_ni_syscall(regs); + } +} +#endif + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = x64_sys_call(regs, unr); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. + */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call(regs, xnr); + return true; + } + return false; +} + +/* Returns true to return using SYSRET, or false to use IRET */ +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr) +{ + add_random_kstack_offset(); + nr = syscall_enter_from_user_mode(regs, nr); + + instrumentation_begin(); + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. 
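+ * __x64_sys_ni_syscall() just returns -ENOSYS, so user space sees the
+ * usual error while the normal syscall entry/exit work still runs.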
*/ + regs->ax = __x64_sys_ni_syscall(regs); + } + + instrumentation_end(); + syscall_exit_to_user_mode(regs); + + /* + * Check that the register state is valid for using SYSRET to exit + * to userspace. Otherwise use the slower but fully capable IRET + * exit path. + */ + + /* XEN PV guests always use the IRET path */ + if (cpu_feature_enabled(X86_FEATURE_XENPV)) + return false; + + /* SYSRET requires RCX == RIP and R11 == EFLAGS */ + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags)) + return false; + + /* CS and SS must match the values set in MSR_STAR */ + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS)) + return false; + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. + * + * TASK_SIZE_MAX covers all user-accessible addresses other than + * the deprecated vsyscall page. + */ + if (unlikely(regs->ip >= TASK_SIZE_MAX)) + return false; + + /* + * SYSRET cannot restore RF. It can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. + */ + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF))) + return false; + + /* Use SYSRET to exit to userspace */ + return true; +} diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c deleted file mode 100644 index fb77908f44f3..000000000000 --- a/arch/x86/entry/syscall_x32.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* System call table for x32 ABI. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <linux/syscalls.h> -#include <asm/syscall.h> - -#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *); -#include <asm/syscalls_x32.h> -#undef __SYSCALL - -#undef __SYSCALL_NORETURN -#define __SYSCALL_NORETURN __SYSCALL - -#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs); -long x32_sys_call(const struct pt_regs *regs, unsigned int nr) -{ - switch (nr) { - #include <asm/syscalls_x32.h> - default: return __x64_sys_ni_syscall(regs); - } -}; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4d0fb2fba7e2..ac007ea00979 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -396,7 +396,7 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx -384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl +384 i386 arch_prctl sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 compat_sys_io_pgetevents 386 i386 rseq sys_rseq 393 i386 semget sys_semget @@ -472,3 +472,4 @@ 464 i386 getxattrat sys_getxattrat 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat +467 i386 open_tree_attr sys_open_tree_attr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5eb708bff1c7..cfb5ca41e30d 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -390,6 +390,7 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 
c9216ac4fb1e..54d3e9774d62 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,7 +4,7 @@ # # Include the generic Makefile to check the built vDSO: -include $(srctree)/lib/vdso/Makefile +include $(srctree)/lib/vdso/Makefile.include # Files to link into the vDSO: vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o @@ -32,7 +32,7 @@ targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 \ -z max-page-size=4096 $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE @@ -133,6 +133,7 @@ KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += -DBUILD_VDSO ifdef CONFIG_MITIGATION_RETPOLINE ifneq ($(RETPOLINE_VDSO_CFLAGS),) @@ -151,10 +152,9 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(LD) -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -T $(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(src)/checkundef.sh '$(NM)' '$@' + -T $(filter %.lds,$^) $(filter %.o,$^) -VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \ +VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 --no-undefined \ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack quiet_cmd_vdso_and_check = VDSO $@ diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh deleted file mode 100755 index 7ee90a9b549d..000000000000 --- a/arch/x86/entry/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h index b56f6b012941..baba612b832c 100644 --- a/arch/x86/entry/vdso/extable.h +++ b/arch/x86/entry/vdso/extable.h @@ -7,7 +7,7 @@ * vDSO uses a dedicated handler the addresses are relative to the overall * exception table, not each individual entry. */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ ASM_VDSO_EXTABLE_HANDLE from to diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 872947c1004c..ec1ac191a057 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/vdso.h> #include <asm/vdso/vsyscall.h> +#include <vdso/datapage.h> /* * Linker script for vDSO. This is an ELF shared object prelinked to @@ -17,14 +18,9 @@ SECTIONS * segment. */ - vvar_start = . 
- __VVAR_PAGES * PAGE_SIZE; - vvar_page = vvar_start; + VDSO_VVAR_SYMS - vdso_rng_data = vvar_page + __VDSO_RND_DATA_OFFSET; - - timens_page = vvar_start + PAGE_SIZE; - - vclock_pages = vvar_start + VDSO_NR_VCLOCK_PAGES * PAGE_SIZE; + vclock_pages = VDSO_VCLOCK_PAGES_START(vdso_u_data); pvclock_page = vclock_pages + VDSO_PAGE_PVCLOCK_OFFSET * PAGE_SIZE; hvclock_page = vclock_pages + VDSO_PAGE_HVCLOCK_OFFSET * PAGE_SIZE; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 90d15f2a7205..f84e8f8fa5fe 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -69,33 +69,12 @@ const char *outfilename; -/* Symbols that we need in vdso2c. */ -enum { - sym_vvar_start, - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - -const int special_pages[] = { - sym_vvar_page, - sym_pvclock_page, - sym_hvclock_page, - sym_timens_page, -}; - struct vdso_sym { const char *name; bool export; }; struct vdso_sym required_syms[] = { - [sym_vvar_start] = {"vvar_start", true}, - [sym_vvar_page] = {"vvar_page", true}, - [sym_pvclock_page] = {"pvclock_page", true}, - [sym_hvclock_page] = {"hvclock_page", true}, - [sym_timens_page] = {"timens_page", true}, {"VDSO32_NOTE_MASK", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 67b3e37576a6..78ed1c1f28b9 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,26 +150,6 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } } - /* Validate mapping addresses. */ - for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - INT_BITS symval = syms[special_pages[i]]; - - if (!symval) - continue; /* The mapping isn't used; ignore it. 
*/ - - if (symval % 4096) - fail("%s must be a multiple of 4096\n", - required_syms[i].name); - if (symval + 4096 < syms[sym_vvar_start]) - fail("%s underruns vvar_start\n", - required_syms[i].name); - if (symval + 4096 > 0) - fail("%s is on the wrong side of the vdso text\n", - required_syms[i].name); - } - if (syms[sym_vvar_start] % 4096) - fail("vvar_begin must be a multiple of 4096\n"); - if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 39e6efc1a9ca..9518bf1ddf35 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -14,7 +14,7 @@ #include <linux/elf.h> #include <linux/cpu.h> #include <linux/ptrace.h> -#include <linux/time_namespace.h> +#include <linux/vdso_datastore.h> #include <asm/pvclock.h> #include <asm/vgtod.h> @@ -27,13 +27,7 @@ #include <asm/vdso/vsyscall.h> #include <clocksource/hyperv_timer.h> -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)vvar_page; -} - -static union vdso_data_store vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; +static_assert(VDSO_NR_PAGES + VDSO_NR_VCLOCK_PAGES == __VDSO_PAGES); unsigned int vclocks_used __read_mostly; @@ -48,13 +42,11 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len), - NULL); + image->alt_len)); return 0; } -static const struct vm_special_mapping vvar_mapping; struct linux_binprm; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, @@ -98,99 +90,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -#ifdef CONFIG_TIME_NS -/* - * The vvar page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) { - if (vma_is_special_mapping(vma, &vvar_mapping)) - zap_vma_pages(vma); - } - mmap_read_unlock(mm); - - return 0; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - const struct vdso_image *image = vma->vm_mm->context.vdso_image; - unsigned long pfn; - long sym_offset; - - if (!image) - return VM_FAULT_SIGBUS; - - sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + - image->sym_vvar_start; - - /* - * Sanity check: a symbol offset of zero means that the page - * does not exist for this vdso image, not that the page is at - * offset zero relative to the text mapping. This should be - * impossible here, because sym_offset should only be zero for - * the page past the end of the vvar mapping. - */ - if (sym_offset == 0) - return VM_FAULT_SIGBUS; - - if (sym_offset == image->sym_vvar_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; - - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the sym_vvar_page offset and - * the real VVAR page is mapped with the sym_timens_page - * offset. 
- * See also the comment near timens_setup_vdso_data(). - */ - if (timens_page) { - unsigned long addr; - vm_fault_t err; - - /* - * Optimization: inside time namespace pre-fault - * VVAR page too. As on timens page there are only - * offsets for clocks on VVAR, it'll be faulted - * shortly by VDSO code. - */ - addr = vmf->address + (image->sym_timens_page - sym_offset); - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - - pfn = page_to_pfn(timens_page); - } - - return vmf_insert_pfn(vma, vmf->address, pfn); - - } else if (sym_offset == image->sym_timens_page) { - struct page *timens_page = find_timens_vvar_page(vma); - - if (!timens_page) - return VM_FAULT_SIGBUS; - - pfn = __pa_symbol(vdso_data) >> PAGE_SHIFT; - return vmf_insert_pfn(vma, vmf->address, pfn); - } - - return VM_FAULT_SIGBUS; -} - static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -212,7 +111,6 @@ static vm_fault_t vvar_vclock_fault(const struct vm_special_mapping *sm, case VDSO_PAGE_HVCLOCK_OFFSET: { unsigned long pfn = hv_get_tsc_pfn(); - if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, pfn); break; @@ -228,10 +126,6 @@ static const struct vm_special_mapping vdso_mapping = { .fault = vdso_fault, .mremap = vdso_mremap, }; -static const struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; static const struct vm_special_mapping vvar_vclock_mapping = { .name = "[vvar_vclock]", .fault = vvar_vclock_fault, @@ -253,13 +147,13 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) return -EINTR; addr = get_unmapped_area(NULL, addr, - image->size - image->sym_vvar_start, 0, 0); + image->size + __VDSO_PAGES * PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - text_start = addr - image->sym_vvar_start; + text_start = addr + __VDSO_PAGES * PAGE_SIZE; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -276,13 +170,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) goto up_fail; } - vma = _install_special_mapping(mm, - addr, - (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); - + vma = vdso_install_vvar_mapping(mm, addr); if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); @@ -290,7 +178,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } vma = _install_special_mapping(mm, - addr + (__VVAR_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE, + VDSO_VCLOCK_PAGES_START(addr), VDSO_NR_VCLOCK_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| VM_PFNMAP, @@ -327,7 +215,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) */ for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || - vma_is_special_mapping(vma, &vvar_mapping) || + vma_is_special_mapping(vma, &vdso_vvar_mapping) || vma_is_special_mapping(vma, &vvar_vclock_mapping)) { mmap_write_unlock(mm); return -EEXIST; diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 780acd3dff22..ec3427463382 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -381,7 +381,8 @@ static void amd_brs_poison_buffer(void) * On ctxswin, sched_in = true, called after the PMU has started * On ctxswout, sched_in = false, called before the PMU is stopped */ -void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool 
sched_in) +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e7a8b8758e08..66f981865091 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,9 +28,6 @@ static u32 ibs_caps; #include <asm/nmi.h> #include <asm/amd-ibs.h> -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - /* attr.config2 */ #define IBS_SW_FILTER_MASK 1 @@ -89,6 +86,7 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u16 min_period; u64 max_period; unsigned long offset_mask[1]; int offset_max; @@ -270,11 +268,19 @@ static int validate_group(struct perf_event *event) return 0; } +static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + return perf_ibs == &perf_ibs_op && + (ibs_caps & IBS_CAPS_OPLDLAT) && + (event->attr.config1 & 0xFFF); +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; - u64 max_cnt, config; + u64 config; int ret; perf_ibs = get_ibs_pmu(event->attr.type); @@ -310,25 +316,47 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; + + if (event->attr.freq) { + hwc->sample_period = perf_ibs->min_period; + } else { + /* Silently mask off lower nibble. IBS hw mandates it. 
*/ + hwc->sample_period &= ~0x0FULL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } } else { - max_cnt = config & perf_ibs->cnt_mask; + u64 period = 0; + + if (event->attr.freq) + return -EINVAL; + + if (perf_ibs == &perf_ibs_op) { + period = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + period |= config & IBS_OP_MAX_CNT_EXT_MASK; + } else { + period = (config & IBS_FETCH_MAX_CNT) << 4; + } + config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; + event->attr.sample_period = period; + hwc->sample_period = period; + + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; } - if (!hwc->sample_period) - return -EINVAL; + if (perf_ibs_ldlat_event(perf_ibs, event)) { + u64 ldlat = event->attr.config1 & 0xFFF; + + if (ldlat < 128 || ldlat > 2048) + return -EINVAL; + ldlat >>= 7; + + config |= (ldlat - 1) << 59; + config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN; + } /* * If we modify hwc->sample_period, we also need to update @@ -349,7 +377,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, int overflow; /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, perf_ibs->min_period, + perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); return overflow; @@ -447,6 +476,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { config |= period & IBS_OP_MAX_CNT_EXT_MASK; @@ -554,6 +586,28 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +static int perf_ibs_check_period(struct perf_event *event, u64 value) +{ + struct perf_ibs *perf_ibs; + u64 low_nibble; + + if (event->attr.freq) + return 0; + + perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + low_nibble = value & 0xFULL; + + /* + * This contradicts with perf_ibs_init() which allows sample period + * with lower nibble bits set but silently masks them off. Whereas + * this returns error. + */ + if (low_nibble || value < perf_ibs->min_period) + return -EINVAL; + + return 0; +} + /* * We need to initialize with empty group if all attributes in the * group are dynamic. @@ -572,7 +626,10 @@ PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); +PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -580,6 +637,18 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } +static umode_t +ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPLDLAT ? 
attr->mode : 0; +} + +static umode_t +ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0; +} + static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, &format_attr_swfilt.attr, @@ -596,6 +665,16 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_cap_attrs[] = { + &ibs_op_ldlat_cap.attr.attr, + NULL, +}; + +static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { + &ibs_op_dtlb_pgsize_cap.attr.attr, + NULL, +}; + static struct attribute_group group_fetch_formats = { .name = "format", .attrs = fetch_attrs, @@ -613,6 +692,18 @@ static struct attribute_group group_zen4_ibs_extensions = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_cap = { + .name = "caps", + .attrs = ibs_op_ldlat_cap_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + +static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { + .name = "caps", + .attrs = ibs_op_dtlb_pgsize_cap_attrs, + .is_visible = ibs_op_dtlb_pgsize_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_fetch_formats, &empty_caps_group, @@ -651,6 +742,11 @@ static struct attribute_group group_op_formats = { .attrs = op_attrs, }; +static struct attribute *ibs_op_ldlat_format_attrs[] = { + &ibs_op_ldlat_format.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -669,10 +765,19 @@ static const struct attribute_group *op_attr_groups[] = { NULL, }; +static struct attribute_group group_ibs_op_ldlat_format = { + .name = "format", + .attrs = ibs_op_ldlat_format_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, &group_zen4_ibs_extensions, + &group_ibs_op_ldlat_cap, + &group_ibs_op_ldlat_format, + &group_ibs_op_dtlb_pgsize_cap, NULL, }; @@ -686,12 +791,14 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, + .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .min_period = 0x10, .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, @@ -709,13 +816,15 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, + .config_mask = IBS_OP_MAX_CNT, .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .min_period = 0x90, .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, @@ -917,6 +1026,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, if (!op_data3->dc_lin_addr_valid) return; + if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) && + !op_data3->dc_phy_addr_valid) + return; + if (!op_data3->dc_l1tlb_miss) { data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; return; @@ -941,6 +1054,8 @@ static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, data_src->mem_lock = PERF_MEM_LOCK_LOCKED; } +/* Be 
careful. Works only for contiguous MSRs. */ +#define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL) #define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, @@ -1021,21 +1136,92 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } } -static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, +static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return perf_ibs == &perf_ibs_op && + sample_type & (PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_WEIGHT_TYPE | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_PHYS_ADDR); +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, + struct perf_event *event, int check_rip) { - if (sample_type & PERF_SAMPLE_RAW || - (perf_ibs == &perf_ibs_op && - (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR || - sample_type & PERF_SAMPLE_PHYS_ADDR))) + if (event->attr.sample_type & PERF_SAMPLE_RAW || + perf_ibs_is_mem_sample_type(perf_ibs, event) || + perf_ibs_ldlat_event(perf_ibs, event)) return perf_ibs->offset_max; else if (check_rip) return 3; return 1; } +static bool perf_ibs_is_kernel_data_addr(struct perf_event *event, + struct perf_ibs_data *ibs_data) +{ + u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW; + union ibs_op_data3 op_data3; + u64 dc_lin_addr; + + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + + return unlikely((event->attr.sample_type & sample_type_mask) && + op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr)); +} + +static bool perf_ibs_is_kernel_br_target(struct perf_event *event, + struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + union ibs_op_data op_data; + u64 br_target; + + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + br_target = ibs_data->regs[br_target_idx]; + + return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) && + op_data.op_brn_ret && kernel_ip(br_target)); +} + +static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event, + struct pt_regs *regs, struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + if (perf_exclude_event(event, regs)) + return true; + + if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel) + return false; + + if (perf_ibs_is_kernel_data_addr(event, ibs_data)) + return true; + + if (br_target_idx != -1 && + perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx)) + return true; + + return false; +} + +static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs, + struct perf_ibs_data *ibs_data) +{ + if (perf_ibs == &perf_ibs_op) { + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18); + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0; + return; + } + + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52); + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -1048,6 +1234,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; u64 *buf, *config, period, new_config = 0; + int br_target_idx = -1; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -1084,7 +1271,7 @@ fail: offset 
= 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { rdmsrl(msr + offset, *buf++); @@ -1093,6 +1280,22 @@ fail: perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + + if (perf_ibs_ldlat_event(perf_ibs, event)) { + union ibs_op_data3 op_data3; + + op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + /* + * Opening event is errored out if load latency threshold is + * outside of [128, 2048] range. Since the event has reached + * interrupt handler, we can safely assume the threshold is + * within [128, 2048] range. + */ + if (!op_data3.ld_op || !op_data3.dc_miss || + op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) + goto out; + } + /* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. @@ -1102,6 +1305,7 @@ fail: if (perf_ibs == &perf_ibs_op) { if (ibs_caps & IBS_CAPS_BRNTRGT) { rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + br_target_idx = size; size++; } if (ibs_caps & IBS_CAPS_OPDATA4) { @@ -1129,10 +1333,19 @@ fail: } if ((event->attr.config2 & IBS_SW_FILTER_MASK) && - perf_exclude_event(event, &regs)) { + perf_ibs_swfilt_discard(perf_ibs, event, &regs, &ibs_data, br_target_idx)) { throttle = perf_event_account_interrupt(event); goto out; } + /* + * Prevent leaking physical addresses to unprivileged users. Skip + * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for + * unprivileged users. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_allow_kernel(&event->attr)) { + perf_ibs_phyaddr_clear(perf_ibs, &ibs_data); + } if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ @@ -1155,6 +1368,10 @@ fail: perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, &regs); + + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + out: if (throttle) { perf_ibs_stop(event, 0); @@ -1244,7 +1461,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index b15f7b950d2e..f8228d8243f7 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -30,7 +30,7 @@ #define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) #define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) -#define IOMMU_NAME_SIZE 16 +#define IOMMU_NAME_SIZE 24 struct perf_amd_iommu { struct list_head list; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 19c7b76e21bc..c06ccca96851 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -371,7 +371,8 @@ void amd_pmu_lbr_del(struct perf_event *event) perf_sched_cb_dec(event->pmu); } -void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2092d615333d..6866cc5acb0b 100644 --- a/arch/x86/events/core.c +++
b/arch/x86/events/core.c @@ -87,13 +87,14 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); -DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); +DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -1298,6 +1299,15 @@ static void x86_pmu_enable(struct pmu *pmu) if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; + + /* + * The late setup (after counters are scheduled) + * is required for some cases, e.g., PEBS counters + * snapshotting. Because an accurate counter index + * is needed. + */ + static_call_cond(x86_pmu_late_setup)(); + /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -2028,13 +2038,14 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); - static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); + + static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) @@ -2625,15 +2636,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; -static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) -{ - static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in); -} - -static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc) +static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { - static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc); + static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in); } void perf_check_microcode(void) @@ -2700,7 +2706,6 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, @@ -2844,7 +2849,7 @@ static bool is_uprobe_at_func_entry(struct pt_regs *regs) return true; /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) return true; return false; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 8f78b0c900ef..39a987d5eb6e 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -36,7 +36,7 @@ enum { BTS_STATE_ACTIVE, }; -static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); +static struct bts_ctx __percpu *bts_ctx; #define BTS_RECORD_SIZE 24 #define BTS_SAFETY_MARGIN 4080 @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[]; + struct bts_phys buf[] __counted_by(nr_bufs); }; static struct pmu bts_pmu; 
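
[Editor's note, not part of the patch.] The bts_buffer hunk above annotates the flexible array with __counted_by(nr_bufs), tying the array length to a named struct member so fortified bounds checks can use it. Below is a minimal standalone sketch of the idea, assuming a userspace fallback macro and made-up struct names; it is illustrative only, not kernel code.

#include <stdio.h>
#include <stdlib.h>

#ifndef __has_attribute
#define __has_attribute(x) 0
#endif

#if __has_attribute(__counted_by__)
#define counted_by(member) __attribute__((__counted_by__(member)))
#else
#define counted_by(member)
#endif

struct chunk {
	void *addr;
	size_t size;
};

struct buffer {
	unsigned int nr_chunks;
	/* the length of chunks[] lives in nr_chunks; instrumentation can bounds-check it */
	struct chunk chunks[] counted_by(nr_chunks);
};

static struct buffer *buffer_alloc(unsigned int n)
{
	struct buffer *b = calloc(1, sizeof(*b) + n * sizeof(struct chunk));

	if (b)
		b->nr_chunks = n;	/* set the count before indexing chunks[] */
	return b;
}

int main(void)
{
	struct buffer *b = buffer_alloc(4);

	if (!b)
		return 1;
	printf("allocated %u chunks\n", b->nr_chunks);
	free(b);
	return 0;
}
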
@@ -231,7 +231,7 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); static void __bts_event_start(struct perf_event *event) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; @@ -260,7 +260,7 @@ static void __bts_event_start(struct perf_event *event) static void bts_event_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf; buf = perf_aux_output_begin(&bts->handle, event); @@ -290,7 +290,7 @@ fail_stop: static void __bts_event_stop(struct perf_event *event, int state) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ WRITE_ONCE(bts->state, state); @@ -305,7 +305,7 @@ static void __bts_event_stop(struct perf_event *event, int state) static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct bts_buffer *buf = NULL; int state = READ_ONCE(bts->state); @@ -338,9 +338,14 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - int state = READ_ONCE(bts->state); + struct bts_ctx *bts; + int state; + + if (!bts_ctx) + return; + bts = this_cpu_ptr(bts_ctx); + state = READ_ONCE(bts->state); /* * Here we transition from INACTIVE to ACTIVE; * if we instead are STOPPED from the interrupt handler, @@ -358,7 +363,12 @@ void intel_bts_enable_local(void) void intel_bts_disable_local(void) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts; + + if (!bts_ctx) + return; + + bts = this_cpu_ptr(bts_ctx); /* * Here we transition from ACTIVE to INACTIVE; @@ -450,12 +460,17 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct perf_event *event = bts->handle.event; + struct bts_ctx *bts; + struct perf_event *event; struct bts_buffer *buf; s64 old_head; int err = -ENOSPC, handled = 0; + if (!bts_ctx) + return 0; + + bts = this_cpu_ptr(bts_ctx); + event = bts->handle.event; /* * The only surefire way of knowing if this NMI is ours is by checking * the write ptr against the PMI threshold. 
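
[Editor's note, not part of the patch.] The bts_ctx conversion above turns a static per-CPU object into one allocated in bts_init(), so intel_bts_enable_local(), intel_bts_disable_local() and intel_bts_interrupt() now have to tolerate a NULL context. A userspace-flavoured sketch of that pattern follows; the names are invented and the kernel's per-CPU API is deliberately not used.

#include <stdio.h>
#include <stdlib.h>

struct demo_ctx {
	int state;
};

/* stays NULL until the (possibly failing) init routine has run */
static struct demo_ctx *demo_ctx;

static int demo_init(void)
{
	demo_ctx = calloc(1, sizeof(*demo_ctx));
	return demo_ctx ? 0 : -1;
}

static void demo_enable(void)
{
	if (!demo_ctx)		/* feature never initialised: do nothing */
		return;
	demo_ctx->state = 1;
}

static int demo_interrupt(void)
{
	if (!demo_ctx)		/* cannot be ours if the context does not exist */
		return 0;
	return demo_ctx->state;
}

int main(void)
{
	demo_enable();					/* safe even before init */
	printf("pre-init handled=%d\n", demo_interrupt());
	if (demo_init() == 0) {
		demo_enable();
		printf("post-init handled=%d\n", demo_interrupt());
	}
	free(demo_ctx);
	return 0;
}
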
@@ -518,7 +533,7 @@ static void bts_event_del(struct perf_event *event, int mode) static int bts_event_add(struct perf_event *event, int mode) { - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_ctx *bts = this_cpu_ptr(bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -605,6 +620,10 @@ static __init int bts_init(void) return -ENODEV; } + bts_ctx = alloc_percpu(struct bts_ctx); + if (!bts_ctx) + return -ENOMEM; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cdb19e3ba3aa..1ac39611fea8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2714,7 +2714,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots, * modify by a NMI. PMU has to be disabled before calling this function. */ -static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2722,13 +2722,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) bool reset = true; int idx; - /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); - if (!slots) - return 0; + if (!val) { + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; - /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + } else { + slots = val[0]; + metrics = val[1]; + /* + * Don't reset the PERF_METRICS and Fixed counter 3 + * for each PEBS record read. Utilize the RDPMC metrics + * clear mode. + */ + reset = false; + } for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) @@ -2771,36 +2782,47 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) return slots; } -static u64 icl_update_topdown_event(struct perf_event *event) +static u64 icl_update_topdown_event(struct perf_event *event, u64 *val) { return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + - x86_pmu.num_topdown_events - 1); + x86_pmu.num_topdown_events - 1, + val); } -DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); -static void intel_pmu_read_topdown_event(struct perf_event *event) +static void intel_pmu_read_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) || + is_pebs_counter_event_group(event)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + bool pmu_enabled = cpuc->enabled; - /* Only need to call update_topdown_event() once for group read. */ - if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && - !is_slots_event(event)) - return; + /* Only need to call update_topdown_event() once for group read. 
*/ + if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ)) + return; - perf_pmu_disable(event->pmu); - static_call(intel_pmu_update_topdown_event)(event); - perf_pmu_enable(event->pmu); -} + cpuc->enabled = 0; + if (pmu_enabled) + intel_pmu_disable_all(); -static void intel_pmu_read_event(struct perf_event *event) -{ - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event)) - intel_pmu_read_topdown_event(event); - else - x86_perf_event_update(event); + /* + * If the PEBS counters snapshotting is enabled, + * the topdown event is available in PEBS records. + */ + if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + static_call(intel_pmu_update_topdown_event)(event, NULL); + else + intel_pmu_drain_pebs_buffer(); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + intel_pmu_enable_all(0); + + return; + } + + x86_perf_event_update(event); } static void intel_pmu_enable_fixed(struct perf_event *event) @@ -2932,7 +2954,7 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { if (unlikely(is_topdown_count(event))) - return static_call(intel_pmu_update_topdown_event)(event); + return static_call(intel_pmu_update_topdown_event)(event, NULL); return x86_perf_event_update(event); } @@ -3070,7 +3092,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* @@ -3098,7 +3120,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - static_call(intel_pmu_update_topdown_event)(NULL); + static_call(intel_pmu_update_topdown_event)(NULL, NULL); } /* @@ -3116,6 +3138,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* + * There may be unprocessed PEBS records in the PEBS buffer, + * which still stores the previous values. + * Process those records first before handling the latest value. 
+ * For example, + * A is a regular counter + * B is a PEBS event which reads A + * C is a PEBS event + * + * The following can happen: + * B-assist A=1 + * C A=2 + * B-assist A=3 + * A-overflow-PMI A=4 + * C-assist-PMI (PEBS buffer) A=5 + * + * The PEBS buffer has to be drained before handling the A-PMI + */ + if (is_pebs_counter_event_group(event)) + x86_pmu.drain_pebs(regs, &data); + if (!intel_pmu_save_and_restart(event)) continue; @@ -4148,6 +4191,13 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } + if ((event->attr.sample_type & PERF_SAMPLE_READ) && + (x86_pmu.intel_cap.pebs_format >= 6) && + x86_pmu.intel_cap.pebs_baseline && + is_sampling_event(event) && + event->attr.precise_ip) + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -4685,9 +4735,9 @@ static int adl_hw_config(struct perf_event *event) return -EOPNOTSUPP; } -static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) +static enum intel_cpu_type adl_get_hybrid_cpu_type(void) { - return HYBRID_INTEL_CORE; + return INTEL_CPU_TYPE_CORE; } static inline bool erratum_hsw11(struct perf_event *event) @@ -5032,7 +5082,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - u8 cpu_type = get_this_hybrid_cpu_type(); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + enum intel_cpu_type cpu_type = c->topo.intel_type; int i; /* @@ -5041,7 +5092,7 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) * on it. There should be a fixup function provided for these * troublesome CPUs (->get_hybrid_cpu_type). */ - if (cpu_type == HYBRID_INTEL_NONE) { + if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) { if (x86_pmu.get_hybrid_cpu_type) cpu_type = x86_pmu.get_hybrid_cpu_type(); else @@ -5058,16 +5109,16 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; u32 native_id; - if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) + if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; - if (cpu_type == HYBRID_INTEL_ATOM) { + if (cpu_type == INTEL_CPU_TYPE_ATOM) { if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - native_id = get_this_hybrid_cpu_native_id(); - if (native_id == skt_native_id && pmu_type == hybrid_small) + native_id = c->topo.intel_native_model_id; + if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - if (native_id == cmt_native_id && pmu_type == hybrid_tiny) + if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny) return &x86_pmu.hybrid_pmu[i]; } } @@ -5244,16 +5295,10 @@ static void intel_pmu_cpu_dead(int cpu) } static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, - bool sched_in) + struct task_struct *task, bool sched_in) { intel_pmu_pebs_sched_task(pmu_ctx, sched_in); - intel_pmu_lbr_sched_task(pmu_ctx, sched_in); -} - -static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc) -{ - intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc); + intel_pmu_lbr_sched_task(pmu_ctx, task, sched_in); } static int intel_pmu_check_period(struct perf_event *event, u64 value) @@ -5424,7 +5469,6 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = 
intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, - .swap_task_ctx = intel_pmu_swap_task_ctx, .check_period = intel_pmu_check_period, @@ -6540,15 +6584,21 @@ __init int intel_pmu_init(void) char *name; struct x86_hybrid_pmu *pmu; + /* Architectural Perfmon was introduced starting with Core "Yonah" */ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { - case 0x6: - return p6_pmu_init(); - case 0xb: + case 6: + if (boot_cpu_data.x86_vfm < INTEL_CORE_YONAH) + return p6_pmu_init(); + break; + case 11: return knc_pmu_init(); - case 0xf: + case 15: return p4_pmu_init(); } + + pr_cont("unsupported CPU family %d model %d ", + boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; } @@ -6696,7 +6746,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_AIRMONT_MID: + case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index f122882ef278..1f7e1a692a7a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -953,11 +953,11 @@ unlock: return 1; } -static inline void intel_pmu_drain_pebs_buffer(void) +void intel_pmu_drain_pebs_buffer(void) { struct perf_sample_data data; - x86_pmu.drain_pebs(NULL, &data); + static_call(x86_pmu_drain_pebs)(NULL, &data); } /* @@ -1294,6 +1294,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) ds->pebs_interrupt_threshold = threshold; } +#define PEBS_DATACFG_CNTRS(x) \ + ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK) + +#define PEBS_DATACFG_CNTR_BIT(x) \ + (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT) + +#define PEBS_DATACFG_FIX(x) \ + ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK) + +#define PEBS_DATACFG_FIX_BIT(x) \ + (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \ + << PEBS_DATACFG_FIX_SHIFT) + static void adaptive_pebs_record_size_update(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1308,10 +1321,58 @@ static void adaptive_pebs_record_size_update(void) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); + if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) { + sz += sizeof(struct pebs_cntr_header); + + /* Metrics base and Metrics Data */ + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + sz += 2 * sizeof(u64); + + if (pebs_data_cfg & PEBS_DATACFG_CNTR) { + sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) + + hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) * + sizeof(u64); + } + } cpuc->pebs_record_size = sz; } +static void __intel_pmu_pebs_update_cfg(struct perf_event *event, + int idx, u64 *pebs_data_cfg) +{ + if (is_metric_event(event)) { + *pebs_data_cfg |= PEBS_DATACFG_METRICS; + return; + } + + *pebs_data_cfg |= PEBS_DATACFG_CNTR; + + if (idx >= INTEL_PMC_IDX_FIXED) + *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED); + else + *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx); +} + + +static void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + u64 pebs_data_cfg = 0; + int i; + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + if (!is_pebs_counter_event_group(event)) + continue; + __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg); + } + + if 
(pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; +} + #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_WEIGHT_TYPE | \ @@ -1914,12 +1975,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc) +{ + int shift = 64 - x86_pmu.cntval_bits; + struct hw_perf_event *hwc; + u64 delta, prev_pmc; + + /* + * A recorded counter may not have an assigned event in the + * following cases. The value should be dropped. + * - An event is deleted. There is still an active PEBS event. + * The PEBS record doesn't shrink on pmu::del(). + * If the counter of the deleted event once occurred in a PEBS + * record, PEBS still records the counter until the counter is + * reassigned. + * - An event is stopped for some reason, e.g., throttled. + * During this period, another event is added and takes the + * counter of the stopped event. The stopped event is assigned + * to another new and uninitialized counter, since the + * x86_pmu_start(RELOAD) is not invoked for a stopped event. + * The PEBS__DATA_CFG is updated regardless of the event state. + * The uninitialized counter can be recorded in a PEBS record. + * But the cpuc->events[uninitialized_counter] is always NULL, + * because the event is stopped. The uninitialized value is + * safely dropped. + */ + if (!event) + return; + + hwc = &event->hw; + prev_pmc = local64_read(&hwc->prev_count); + + /* Only update the count when the PMU is disabled */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + local64_set(&hwc->prev_count, pmc); + + delta = (pmc << shift) - (prev_pmc << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct pebs_cntr_header *cntr, + void *next_record) +{ + int bit; + + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) { + intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record); + next_record += sizeof(u64); + } + + for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) { + /* The slots event will be handled with perf_metric later */ + if ((cntr->metrics == INTEL_CNTR_METRICS) && + (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) { + next_record += sizeof(u64); + continue; + } + intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED], + *(u64 *)next_record); + next_record += sizeof(u64); + } + + /* HW will reload the value right after the overflow. */ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period); + + if (cntr->metrics == INTEL_CNTR_METRICS) { + static_call(intel_pmu_update_topdown_event) + (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS], + (u64 *)next_record); + next_record += 2 * sizeof(u64); + } +} + #define PEBS_LATENCY_MASK 0xffff /* * With adaptive PEBS the layout depends on what fields are configured. 
*/ - static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -2049,6 +2187,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) { + struct pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct pebs_cntr_header); + /* + * The PEBS_DATA_CFG is a global register, which is the + * superset configuration for all PEBS events. + * For the PEBS record of non-sample-read group, ignore + * the counter snapshot fields. + */ + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + WARN_ONCE(next_record != __pebs + basic->format_size, "PEBS record size %u, expected %llu, config %llx\n", basic->format_size, @@ -2094,15 +2254,6 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) return NULL; } -void intel_pmu_auto_reload_read(struct perf_event *event) -{ - WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)); - - perf_pmu_disable(event->pmu); - intel_pmu_drain_pebs_buffer(); - perf_pmu_enable(event->pmu); -} - /* * Special variant of intel_pmu_save_and_restart() for auto-reload. */ @@ -2211,13 +2362,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event, } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); + if ((is_pebs_counter_event_group(event))) { + /* + * The value of each sample has been updated when setup + * the corresponding sample data. + */ + perf_event_update_userpage(event); + } else { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. 
+ */ + intel_pmu_save_and_restart_reload(event, count); + } } else intel_pmu_save_and_restart(event); } @@ -2552,6 +2711,11 @@ void __init intel_ds_init(void) break; case 6: + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + x86_pmu.late_setup = intel_pmu_late_setup; + } + fallthrough; case 5: x86_pmu.pebs_ept = 1; fallthrough; @@ -2576,7 +2740,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); } - pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual); /* * The PEBS-via-PT is not supported on hybrid platforms, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index dc641b50814e..f44c3d866f24 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -422,11 +422,17 @@ static __always_inline bool lbr_is_reset_in_cstate(void *ctx) return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); } +static inline bool has_lbr_callstack_users(void *ctx) +{ + return task_context_opt(ctx)->lbr_callstack_users || + x86_pmu.lbr_callstack_users; +} + static void __intel_pmu_lbr_restore(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_context_opt(ctx)->lbr_callstack_users == 0 || + if (!has_lbr_callstack_users(ctx) || task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { intel_pmu_lbr_reset(); return; @@ -503,7 +509,7 @@ static void __intel_pmu_lbr_save(void *ctx) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (task_context_opt(ctx)->lbr_callstack_users == 0) { + if (!has_lbr_callstack_users(ctx)) { task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } @@ -516,32 +522,11 @@ static void __intel_pmu_lbr_save(void *ctx) cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } -void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc) -{ - void *prev_ctx_data, *next_ctx_data; - - swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); - - /* - * Architecture specific synchronization makes sense in case - * both prev_epc->task_ctx_data and next_epc->task_ctx_data - * pointers are allocated. - */ - - prev_ctx_data = next_epc->task_ctx_data; - next_ctx_data = prev_epc->task_ctx_data; - - if (!prev_ctx_data || !next_ctx_data) - return; - - swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, - task_context_opt(next_ctx_data)->lbr_callstack_users); -} - -void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_ctx_data *ctx_data; void *task_ctx; if (!cpuc->lbr_users) @@ -552,14 +537,18 @@ void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched * the task was scheduled out, restore the stack. Otherwise flush * the LBR stack. */ - task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL; + rcu_read_lock(); + ctx_data = rcu_dereference(task->perf_ctx_data); + task_ctx = ctx_data ? 
ctx_data->data : NULL; if (task_ctx) { if (sched_in) __intel_pmu_lbr_restore(task_ctx); else __intel_pmu_lbr_save(task_ctx); + rcu_read_unlock(); return; } + rcu_read_unlock(); /* * Since a context switch can flip the address space and LBR entries @@ -588,9 +577,19 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data) - task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++; + if (branch_user_callstack(cpuc->br_sel)) { + if (event->attach_state & PERF_ATTACH_TASK) { + struct task_struct *task = event->hw.target; + struct perf_ctx_data *ctx_data; + rcu_read_lock(); + ctx_data = rcu_dereference(task->perf_ctx_data); + if (ctx_data) + task_context_opt(ctx_data->data)->lbr_callstack_users++; + rcu_read_unlock(); + } else + x86_pmu.lbr_callstack_users++; + } /* * Request pmu::sched_task() callback, which will fire inside the * regular perf event scheduling, so that call will: @@ -664,9 +663,19 @@ void intel_pmu_lbr_del(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - if (branch_user_callstack(cpuc->br_sel) && - event->pmu_ctx->task_ctx_data) - task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--; + if (branch_user_callstack(cpuc->br_sel)) { + if (event->attach_state & PERF_ATTACH_TASK) { + struct task_struct *task = event->hw.target; + struct perf_ctx_data *ctx_data; + + rcu_read_lock(); + ctx_data = rcu_dereference(task->perf_ctx_data); + if (ctx_data) + task_context_opt(ctx_data->data)->lbr_callstack_users--; + rcu_read_unlock(); + } else + x86_pmu.lbr_callstack_users--; + } if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 844bc4fc4724..fb726c6fc6e7 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -10,6 +10,7 @@ #include <linux/perf_event.h> #include <asm/perf_event_p4.h> +#include <asm/cpu_device_id.h> #include <asm/hardirq.h> #include <asm/apic.h> @@ -732,9 +733,9 @@ static bool p4_event_match_cpu_model(unsigned int event_idx) { /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ if (event_idx == P4_EVENT_INSTR_COMPLETED) { - if (boot_cpu_data.x86_model != 3 && - boot_cpu_data.x86_model != 4 && - boot_cpu_data.x86_model != 6) + if (boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT && + boot_cpu_data.x86_vfm != INTEL_P4_PRESCOTT_2M && + boot_cpu_data.x86_vfm != INTEL_P4_CEDARMILL) return false; } diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index a6cffb4f4ef5..65b45e9d7016 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -2,6 +2,8 @@ #include <linux/perf_event.h> #include <linux/types.h> +#include <asm/cpu_device_id.h> + #include "../perf_event.h" /* @@ -248,30 +250,8 @@ __init int p6_pmu_init(void) { x86_pmu = p6_pmu; - switch (boot_cpu_data.x86_model) { - case 1: /* Pentium Pro */ + if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO) x86_add_quirk(p6_pmu_rdpmc_quirk); - break; - - case 3: /* Pentium II - Klamath */ - case 5: /* Pentium II - Deschutes */ - case 6: /* Pentium II - Mendocino */ - break; - - case 7: /* Pentium III - Katmai */ - case 8: /* Pentium III - Coppermine */ - case 10: /* Pentium III Xeon */ - case 11: /* Pentium III - Tualatin */ - break; - - case 9: /* Pentium M - Banias */ - case 13: /* Pentium M - Dothan */ - break; - - default: - pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); - return -ENODEV; - } 
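
The p4.c and p6.c hunks above replace open-coded boot_cpu_data.x86_model checks with comparisons against packed vendor/family/model (VFM) values. Below is a minimal sketch of that pattern, not taken from the patch: the constant name is made up, and it assumes the VFM_MAKE() helper and the x86_vfm field provided by <asm/cpu_device_id.h> and <asm/processor.h>.

#include <asm/cpu_device_id.h>	/* VFM_MAKE() and friends */
#include <asm/processor.h>	/* boot_cpu_data, X86_VENDOR_INTEL */

/* Hypothetical packed constant: Intel, family 6, model 0x01 (Pentium Pro). */
#define EXAMPLE_VFM_PPRO	VFM_MAKE(X86_VENDOR_INTEL, 6, 0x01)

static bool example_cpu_is_ppro(void)
{
	/*
	 * A single compare against the packed vendor/family/model value
	 * replaces the separate vendor, family and model-number checks
	 * that the removed switch statement above used to perform.
	 */
	return boot_cpu_data.x86_vfm == EXAMPLE_VFM_PPRO;
}

The INTEL_PENTIUM_PRO and INTEL_P4_* constants used in the actual hunks are the pre-packed equivalents of such values, presumably made visible through the <asm/cpu_device_id.h> include the patch adds.
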
memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 60b3078b7502..a34e50fc4a8f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -347,8 +347,7 @@ void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) { - hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - box->hrtimer.function = uncore_pmu_hrtimer; + hrtimer_setup(&box->hrtimer, uncore_pmu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 31c2771545a6..2c0ce0e9545e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -115,6 +115,11 @@ static inline bool is_branch_counters_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; } +static inline bool is_pebs_counter_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -669,18 +674,6 @@ enum { #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) -/* - * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture - * of the core. Bits 31-24 indicates its core type (Core or Atom) - * and Bits [23:0] indicates the native model ID of the core. - * Core type and native model ID are defined in below enumerations. - */ -enum hybrid_cpu_type { - HYBRID_INTEL_NONE, - HYBRID_INTEL_ATOM = 0x20, - HYBRID_INTEL_CORE = 0x40, -}; - #define X86_HYBRID_PMU_ATOM_IDX 0 #define X86_HYBRID_PMU_CORE_IDX 1 #define X86_HYBRID_PMU_TINY_IDX 2 @@ -697,11 +690,6 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; -enum atom_native_id { - cmt_native_id = 0x2, /* Crestmont */ - skt_native_id = 0x3, /* Skymont */ -}; - struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -800,6 +788,7 @@ struct x86_pmu { u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + void (*late_setup)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -869,7 +858,7 @@ struct x86_pmu { void (*check_microcode)(void); void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, - bool sched_in); + struct task_struct *task, bool sched_in); /* * Intel Arch Perfmon v2+ @@ -914,6 +903,7 @@ struct x86_pmu { const int *lbr_sel_map; /* lbr_select mappings */ int *lbr_ctl_map; /* LBR_CTL mappings */ }; + u64 lbr_callstack_users; /* lbr callstack system wide users */ bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ @@ -952,14 +942,6 @@ struct x86_pmu { int num_topdown_events; /* - * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data) - * switch helper to bridge calls from perf/core to perf/x86. 
- * See struct pmu::swap_task_ctx() usage for examples; - */ - void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc); - - /* * AMD bits */ unsigned int amd_nb_constraints : 1; @@ -994,7 +976,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); + enum intel_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { @@ -1107,6 +1089,8 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); +DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1148,6 +1132,12 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); +static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val) +{ + return x86_perf_event_update(event); +} +DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); + static inline unsigned int x86_pmu_config_addr(int index) { return x86_pmu.eventsel + (x86_pmu.addr_offset ? @@ -1394,7 +1384,8 @@ void amd_pmu_lbr_reset(void); void amd_pmu_lbr_read(void); void amd_pmu_lbr_add(struct perf_event *event); void amd_pmu_lbr_del(struct perf_event *event); -void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in); void amd_pmu_lbr_enable_all(void); void amd_pmu_lbr_disable_all(void); int amd_pmu_lbr_hw_config(struct perf_event *event); @@ -1448,7 +1439,8 @@ static inline void amd_pmu_brs_del(struct perf_event *event) perf_sched_cb_dec(event->pmu); } -void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in); #else static inline int amd_brs_init(void) { @@ -1473,7 +1465,8 @@ static inline void amd_pmu_brs_del(struct perf_event *event) { } -static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) +static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) { } @@ -1643,7 +1636,7 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); -void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); @@ -1653,10 +1646,8 @@ void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, struct cpu_hw_events *cpuc, struct perf_event *event); -void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc); - -void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 6c977c19f2cd..1d9e385649b5 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* 
haswell style datala, load */ PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ - /* 0x00080 */ +PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 6941f4811bec..8ddace8cea96 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -274,8 +274,7 @@ static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) { struct hrtimer *hr = &rapl_pmu->hrtimer; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; + hrtimer_setup(hr, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, @@ -730,6 +729,7 @@ static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_ { int nr_rapl_pmu = topology_max_packages(); struct rapl_pmus *rapl_pmus; + int ret; /* * rapl_pmu_scope must be either PKG, DIE or CORE @@ -761,7 +761,11 @@ static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_ rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; - return init_rapl_pmu(rapl_pmus); + ret = init_rapl_pmu(rapl_pmus); + if (ret) + kfree(rapl_pmus); + + return ret; } static struct rapl_model model_snb = { diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index cc8c3bd0e7c2..1f7c3082a36d 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 58f4ddecc5fa..4566000e15c4 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,6 +8,7 @@ generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h +generated-y += cpufeaturemasks.h generic-y += early_ioremap.h generic-y += fprobe.h diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305..4a37a8bd87fd 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -15,7 +15,7 @@ #define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) #define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stddef.h> @@ -48,7 +48,7 @@ ".popsection\n" \ "671:" -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock " #else /* ! 
CONFIG_SMP */ #define LOCK_PREFIX_HERE "" @@ -87,20 +87,19 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; -struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod); -extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); -extern void apply_returns(s32 *start, s32 *end, struct module *mod); -extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi, struct module *mod); + s32 *start_cfi, s32 *end_cfi); + +struct module; struct callthunk_sites { s32 *call_start, *call_end; - struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS @@ -237,10 +236,12 @@ static inline int alternatives_text_reserved(void *start, void *end) * references: i.e., if used for a function, it would add the PLT * suffix. */ -#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \ +#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ : ALT_OUTPUT_SP(output) \ - : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + : [old] "i" (oldfunc), [new] "i" (newfunc) \ + COMMA(input) \ + : clobbers) /* * Like alternative_call, but there are two features and respective functions. @@ -249,24 +250,14 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, old function is used. */ #define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ - output, input...) \ + output, input, clobbers...) \ asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ "call %c[new2]", ft_flags2) \ : ALT_OUTPUT_SP(output) \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) - -/* - * use this macro(s) if you need more than one output parameter - * in alternative_io - */ -#define ASM_OUTPUT2(a...) a - -/* - * use this macro if you need clobbers but no inputs in - * alternative_{input,io,call}() - */ -#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr + [new2] "i" (newfunc2) \ + COMMA(input) \ + : clobbers) #define ALT_OUTPUT_SP(...) 
ASM_CALL_CONSTRAINT, ## __VA_ARGS__ @@ -286,7 +277,7 @@ static inline int alternatives_text_reserved(void *start, void *end) void BUG_func(void); void nop_func(void); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef CONFIG_SMP .macro LOCK_PREFIX @@ -369,6 +360,6 @@ void nop_func(void); ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ newinstr_yes, ft_flags -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index cb2a5e113daa..77f3a589a99a 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 4c4efb93045e..adfa0854cf2d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,7 +27,6 @@ struct amd_l3_cache { }; struct amd_northbridge { - struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; diff --git a/arch/x86/include/asm/amd_node.h b/arch/x86/include/asm/amd_node.h index 113ad3e8ee40..23fe617898a8 100644 --- a/arch/x86/include/asm/amd_node.h +++ b/arch/x86/include/asm/amd_node.h @@ -30,7 +30,31 @@ static inline u16 amd_num_nodes(void) return topology_amd_nodes_per_pkg() * topology_max_packages(); } +#ifdef CONFIG_AMD_NODE int __must_check amd_smn_read(u16 node, u32 address, u32 *value); int __must_check amd_smn_write(u16 node, u32 address, u32 value); +/* Should only be used by the HSMP driver. 
*/ +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write); +#else +static inline int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { return -ENODEV; } +static inline int __must_check amd_smn_write(u16 node, u32 address, u32 value) { return -ENODEV; } + +static inline int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_NODE */ + +/* helper for use with read_poll_timeout */ +static inline int smn_read_register(u32 reg) +{ + int data, rc; + + rc = amd_smn_read(0, reg, &data); + if (rc) + return rc; + + return data; +} #endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f21ff1932699..c903d358405d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -99,8 +99,8 @@ static inline void native_apic_mem_write(u32 reg, u32 v) volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, - ASM_OUTPUT2("=r" (v), "=m" (*addr)), - ASM_OUTPUT2("0" (v), "m" (*addr))); + ASM_OUTPUT("=r" (v), "=m" (*addr)), + ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index ba88edd0d58b..b5982b94bdba 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -16,9 +16,10 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; - asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + asm_inline (ALTERNATIVE("call __sw_hweight32", + "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT) + : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT + : [val] REG_IN (w)); return res; } @@ -44,9 +45,10 @@ static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; - asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + asm_inline (ALTERNATIVE("call __sw_hweight64", + "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT) + : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT + : [val] REG_IN (w)); return res; } diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 3674006e3974..11c6fecc3ad7 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -16,10 +16,10 @@ #include <asm/gsseg.h> #include <asm/nospec-branch.h> -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 extern void cmpxchg8b_emu(void); #endif -#if defined(__GENKSYMS__) && defined(CONFIG_STACKPROTECTOR) +#ifdef CONFIG_STACKPROTECTOR extern unsigned long __ref_stack_chk_guard; #endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 2bec0c89a95c..cc2881576c2c 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) 
x,## __VA_ARGS__, @@ -113,7 +113,7 @@ #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef __pic__ static __always_inline __pure void *rip_rel_ptr(void *p) { @@ -144,7 +144,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # include <asm/extable_fixup_types.h> /* Exception table entry */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ @@ -164,7 +164,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_NOKPROBE(entry) # endif -#else /* ! __ASSEMBLY__ */ +#else /* ! __ASSEMBLER__ */ # define DEFINE_EXTABLE_TYPE_REG \ ".macro extable_type_reg type:req reg:req\n" \ @@ -213,6 +213,17 @@ static __always_inline __pure void *rip_rel_ptr(void *p) /* For C file, we already have NOKPROBE_SYMBOL macro */ +/* Insert a comma if args are non-empty */ +#define COMMA(x...) __COMMA(x) +#define __COMMA(...) , ##__VA_ARGS__ + +/* + * Combine multiple asm inline constraint args into a single arg for passing to + * another macro. + */ +#define ASM_OUTPUT(x...) x +#define ASM_INPUT(x...) x + /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -221,7 +232,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ASM_EXTABLE(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 55b4d24356ea..75743f1dfd4e 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -30,14 +30,14 @@ static __always_inline void arch_atomic_set(atomic_t *v, int i) static __always_inline void arch_atomic_add(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm_inline volatile(LOCK_PREFIX "addl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline void arch_atomic_sub(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "subl %1,%0" + asm_inline volatile(LOCK_PREFIX "subl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } @@ -50,14 +50,14 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) static __always_inline void arch_atomic_inc(atomic_t *v) { - asm volatile(LOCK_PREFIX "incl %0" + asm_inline volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_dec(atomic_t *v) { - asm volatile(LOCK_PREFIX "decl %0" + asm_inline volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec @@ -116,7 +116,7 @@ static __always_inline int arch_atomic_xchg(atomic_t *v, int new) static __always_inline void arch_atomic_and(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "andl %1,%0" + asm_inline volatile(LOCK_PREFIX "andl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); @@ -134,7 +134,7 @@ static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) static __always_inline void arch_atomic_or(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "orl %1,%0" + asm_inline volatile(LOCK_PREFIX "orl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); @@ -152,7 +152,7 @@ static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) static __always_inline void arch_atomic_xor(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "xorl %1,%0" + asm_inline 
volatile(LOCK_PREFIX "xorl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6c6e9b9f98a4..ab838205c1c6 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -48,17 +48,20 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) ATOMIC64_EXPORT(atomic64_##sym) #endif -#ifdef CONFIG_X86_CMPXCHG64 -#define __alternative_atomic64(f, g, out, in...) \ - asm volatile("call %c[func]" \ +#ifdef CONFIG_X86_CX8 +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + asm volatile("call %c[func]" \ : ALT_OUTPUT_SP(out) \ - : [func] "i" (atomic64_##g##_cx8), ## in) + : [func] "i" (atomic64_##g##_cx8) \ + COMMA(in) \ + : clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define __alternative_atomic64(f, g, out, in...) \ - alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ - X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT(out), \ + ASM_INPUT(in), clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ ATOMIC64_DECL_ONE(sym##_386) @@ -69,8 +72,8 @@ ATOMIC64_DECL_ONE(inc_386); ATOMIC64_DECL_ONE(dec_386); #endif -#define alternative_atomic64(f, out, in...) \ - __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) +#define alternative_atomic64(f, out, in, clobbers...) \ + __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers) ATOMIC64_DECL(read); ATOMIC64_DECL(set); @@ -105,9 +108,10 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - alternative_atomic64(xchg, "=&A" (o), - "S" (v), "b" (low), "c" (high) - : "memory"); + alternative_atomic64(xchg, + "=&A" (o), + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "memory"); return o; } #define arch_atomic64_xchg arch_atomic64_xchg @@ -116,23 +120,25 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - alternative_atomic64(set, /* no output */, - "S" (v), "b" (low), "c" (high) - : "eax", "edx", "memory"); + alternative_atomic64(set, + /* no output */, + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "eax", "edx", "memory"); } static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; - alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); + alternative_atomic64(read, "=&A" (r), "c" (v), "memory"); return r; } static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_add_return arch_atomic64_add_return @@ -140,8 +146,9 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_sub_return arch_atomic64_sub_return @@ -149,8 +156,10 @@ static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; - 
alternative_atomic64(inc_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(inc_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return @@ -158,8 +167,10 @@ static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; - alternative_atomic64(dec_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(dec_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_dec_return arch_atomic64_dec_return @@ -167,28 +178,34 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } static __always_inline void arch_atomic64_inc(atomic64_t *v) { - __alternative_atomic64(inc, inc_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(inc, inc_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_dec(atomic64_t *v) { - __alternative_atomic64(dec, dec_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(dec, dec_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_dec arch_atomic64_dec @@ -197,8 +214,9 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), - "S" (v) : "memory"); + ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)), + "S" (v), + "memory"); return (int)a; } #define arch_atomic64_add_unless arch_atomic64_add_unless @@ -206,8 +224,10 @@ static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; - alternative_atomic64(inc_not_zero, "=&a" (r), - "S" (v) : "ecx", "edx", "memory"); + alternative_atomic64(inc_not_zero, + "=&a" (r), + "S" (v), + "ecx", "edx", "memory"); return r; } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero @@ -215,8 +235,10 @@ static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; - alternative_atomic64(dec_if_positive, "=&A" (r), - "S" (v) : "ecx", "memory"); + alternative_atomic64(dec_if_positive, + "=&A" (r), + "S" (v), + "ecx", "memory"); return r; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index ae12acae5b06..87b496325b5b 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -22,14 +22,14 @@ static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "addq %1,%0" + asm_inline volatile(LOCK_PREFIX "addq %1, %0" : "=m" 
(v->counter) : "er" (i), "m" (v->counter) : "memory"); } static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "subq %1,%0" + asm_inline volatile(LOCK_PREFIX "subq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } @@ -42,7 +42,7 @@ static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) static __always_inline void arch_atomic64_inc(atomic64_t *v) { - asm volatile(LOCK_PREFIX "incq %0" + asm_inline volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } @@ -50,7 +50,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) static __always_inline void arch_atomic64_dec(atomic64_t *v) { - asm volatile(LOCK_PREFIX "decq %0" + asm_inline volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } @@ -110,7 +110,7 @@ static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "andq %1,%0" + asm_inline volatile(LOCK_PREFIX "andq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); @@ -128,7 +128,7 @@ static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "orq %1,%0" + asm_inline volatile(LOCK_PREFIX "orq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); @@ -146,7 +146,7 @@ static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "xorq %1,%0" + asm_inline volatile(LOCK_PREFIX "xorq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 7b44b3c4cce1..db70832232d4 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define __mb() asm volatile("mfence":::"memory") @@ -50,7 +50,7 @@ #define __dma_rmb() barrier() #define __dma_wmb() barrier() -#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") +#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index b96d45944c59..100413aff640 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -52,12 +52,12 @@ static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "orb %b1,%0" + asm_inline volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } 
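
The atomic and bitops hunks above consistently move from asm volatile to asm_inline volatile and from the "lock; op" spelling to "lock op". Here is a self-contained sketch of the resulting shape; the function name and the fallback macro are made up so the snippet builds outside the kernel tree, whereas in-tree asm_inline comes from <linux/compiler_types.h>.

/* Fallback for building this sketch standalone with a GCC/Clang that supports "asm inline". */
#ifndef asm_inline
# define asm_inline	asm __inline
#endif

static inline void example_lock_add(int i, int *counter)
{
	/*
	 * "lock addl" keeps the prefix and the instruction in one
	 * assembler statement; the emitted machine code is the same
	 * as the old "lock; addl" two-statement form. asm_inline
	 * additionally tells the compiler to treat the statement as
	 * minimal-size for inlining decisions instead of estimating
	 * its cost from the length of the asm string.
	 */
	asm_inline volatile("lock addl %1, %0"
			    : "+m" (*counter)
			    : "ir" (i)
			    : "memory");
}

Called as example_lock_add(1, &x), this follows the same pattern the converted arch_atomic_add() and arch_set_bit() hunks use above.
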
@@ -72,11 +72,11 @@ static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "andb %b1,%0" + asm_inline volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -98,7 +98,7 @@ static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "xorb %2,%1" + asm_inline volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "iq" ((char)mask) : "memory"); @@ -122,11 +122,11 @@ static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "xorb %b1,%0" + asm_inline volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 3e5b111e619d..3f02ff6d333d 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -74,7 +74,7 @@ # define BOOT_STACK_SIZE 0x1000 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern unsigned int output_len; extern const unsigned long kernel_text_size; extern const unsigned long kernel_total_size; diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index e85ac0c7c039..f0e9acf72547 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -17,13 +17,17 @@ * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. 
*/ #define INSN_ASOP 0x67 +#define INSN_LOCK 0xf0 #define OPCODE_ESCAPE 0x0f #define SECOND_BYTE_OPCODE_UD1 0xb9 #define SECOND_BYTE_OPCODE_UD2 0x0b #define BUG_NONE 0xffff -#define BUG_UD1 0xfffe -#define BUG_UD2 0xfffd +#define BUG_UD2 0xfffe +#define BUG_UD1 0xfffd +#define BUG_UD1_UBSAN 0xfffc +#define BUG_EA 0xffea +#define BUG_LOCK 0xfff0 #ifdef CONFIG_GENERIC_BUG diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 31d19c815f99..3e51ba459154 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -101,6 +101,16 @@ enum cfi_mode { extern enum cfi_mode cfi_mode; +#ifdef CONFIG_FINEIBT_BHI +extern bool cfi_bhi; +#else +#define cfi_bhi (0) +#endif + +typedef u8 bhi_thunk[32]; +extern bhi_thunk __bhi_args[]; +extern bhi_thunk __bhi_args_end[]; + struct pt_regs; #ifdef CONFIG_CFI_CLANG @@ -125,6 +135,18 @@ static inline int cfi_get_offset(void) #define cfi_get_offset cfi_get_offset extern u32 cfi_get_func_hash(void *func); +extern int cfi_get_func_arity(void *func); + +#ifdef CONFIG_FINEIBT +extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type); +#else +static inline bool +decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + return false; +} + +#endif #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) @@ -137,6 +159,10 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } +static inline int cfi_get_func_arity(void *func) +{ + return 0; +} #endif /* CONFIG_CFI_CLANG */ #if HAS_KERNEL_IBT == 1 diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 5612648b0202..b61f32c3459f 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -44,22 +44,22 @@ extern void __add_wrong_size(void) __typeof__ (*(ptr)) __ret = (arg); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ - asm volatile (lock #op "b %b0, %1\n" \ + asm_inline volatile (lock #op "b %b0, %1" \ : "+q" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ - asm volatile (lock #op "w %w0, %1\n" \ + asm_inline volatile (lock #op "w %w0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_L: \ - asm volatile (lock #op "l %0, %1\n" \ + asm_inline volatile (lock #op "l %0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ - asm volatile (lock #op "q %q0, %1\n" \ + asm_inline volatile (lock #op "q %q0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ @@ -91,7 +91,7 @@ extern void __add_wrong_size(void) case __X86_CASE_B: \ { \ volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ + asm_inline volatile(lock "cmpxchgb %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ @@ -100,7 +100,7 @@ extern void __add_wrong_size(void) case __X86_CASE_W: \ { \ volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ + asm_inline volatile(lock "cmpxchgw %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -109,7 +109,7 @@ extern void __add_wrong_size(void) case __X86_CASE_L: \ { \ volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ + asm_inline volatile(lock "cmpxchgl %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -118,7 +118,7 @@ extern void __add_wrong_size(void) case __X86_CASE_Q: \ { \ volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm 
volatile(lock "cmpxchgq %2,%1" \ + asm_inline volatile(lock "cmpxchgq %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -134,7 +134,7 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) #define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + __raw_cmpxchg((ptr), (old), (new), (size), "lock ") #define __cmpxchg_local(ptr, old, new, size) \ __raw_cmpxchg((ptr), (old), (new), (size), "") @@ -165,7 +165,7 @@ extern void __add_wrong_size(void) case __X86_CASE_B: \ { \ volatile u8 *__ptr = (volatile u8 *)(_ptr); \ - asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgb %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -177,7 +177,7 @@ extern void __add_wrong_size(void) case __X86_CASE_W: \ { \ volatile u16 *__ptr = (volatile u16 *)(_ptr); \ - asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgw %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -189,7 +189,7 @@ extern void __add_wrong_size(void) case __X86_CASE_L: \ { \ volatile u32 *__ptr = (volatile u32 *)(_ptr); \ - asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgl %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -201,7 +201,7 @@ extern void __add_wrong_size(void) case __X86_CASE_Q: \ { \ volatile u64 *__ptr = (volatile u64 *)(_ptr); \ - asm volatile(lock "cmpxchgq %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgq %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -222,7 +222,7 @@ extern void __add_wrong_size(void) __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) #define __sync_try_cmpxchg(ptr, pold, new, size) \ - __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock; ") + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ") #define __try_cmpxchg_local(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), "") diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783dd..371f7906019e 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -19,7 +19,7 @@ union __u64_halves { union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(_lock "cmpxchg8b %[ptr]" \ + asm_inline volatile(_lock "cmpxchg8b %[ptr]" \ : [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ @@ -45,7 +45,7 @@ static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(_lock "cmpxchg8b %[ptr]" \ + asm_inline volatile(_lock "cmpxchg8b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ [ptr] "+m" (*(_ptr)), \ @@ -69,7 +69,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define arch_cmpxchg64 __cmpxchg64 @@ -91,19 +91,21 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call 
cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ o.full; \ }) static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock "); } #define arch_cmpxchg64 arch_cmpxchg64 @@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(_lock_loc \ - "call cmpxchg8b_emu", \ - _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ - "+a" (o.low), "+d" (o.high)) \ - : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \ - : "memory"); \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ \ if (unlikely(!ret)) \ *(_oldp) = o.full; \ @@ -136,7 +140,7 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock "); } #define arch_try_cmpxchg64 arch_try_cmpxchg64 diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 5e241306db26..71d1e72ed879 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -38,7 +38,7 @@ union __u128_halves { union __u128_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(_lock "cmpxchg16b %[ptr]" \ + asm_inline volatile(_lock "cmpxchg16b %[ptr]" \ : [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ @@ -65,7 +65,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(_lock "cmpxchg16b %[ptr]" \ + asm_inline volatile(_lock "cmpxchg16b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ [ptr] "+m" (*(_ptr)), \ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index aa6c8f8ca958..e7225452963f 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -15,6 +15,11 @@ enum cc_vendor { extern enum cc_vendor cc_vendor; extern u64 cc_mask; +static inline u64 cc_get_mask(void) +{ + return cc_mask; +} + static inline void cc_set_mask(u64 mask) { RIP_REL_REF(cc_mask) = mask; @@ -25,7 +30,10 @@ u64 cc_mkdec(u64 val); void cc_random_init(void); #else #define cc_vendor (CC_VENDOR_NONE) -static const u64 cc_mask = 0; +static inline u64 cc_get_mask(void) +{ + return 0; +} static inline u64 cc_mkenc(u64 val) { diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 98eced5084ca..ad235dda1ded 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -12,7 +12,6 @@ #ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 -#define safe_smp_processor_id() 0 #endif /* CONFIG_SMP */ #ifdef CONFIG_HOTPLUG_CPU @@ -50,20 +49,6 @@ static inline void split_lock_init(void) {} static inline void bus_lock_init(void) {} #endif -#ifdef CONFIG_CPU_SUP_INTEL -u8 
get_this_hybrid_cpu_type(void); -u32 get_this_hybrid_cpu_native_id(void); -#else -static inline u8 get_this_hybrid_cpu_type(void) -{ - return 0; -} - -static inline u32 get_this_hybrid_cpu_native_id(void) -{ - return 0; -} -#endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index ba32e0f44cba..6be777a06944 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -57,7 +57,7 @@ #define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0) /** - * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching + * X86_MATCH_CPU - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY @@ -74,47 +74,18 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. */ -#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ - _steppings, _feature, _data) { \ - .vendor = X86_VENDOR_##_vendor, \ - .family = _family, \ - .model = _model, \ - .steppings = _steppings, \ - .feature = _feature, \ - .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \ - .driver_data = (unsigned long) _data \ -} - -#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ - _steppings, _feature, _data) { \ +#define X86_MATCH_CPU(_vendor, _family, _model, _steppings, _feature, _type, _data) { \ .vendor = _vendor, \ .family = _family, \ .model = _model, \ .steppings = _steppings, \ .feature = _feature, \ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \ + .type = _type, \ .driver_data = (unsigned long) _data \ } /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching - * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY - * The name is expanded to X86_VENDOR_@_vendor - * @_family: The family number or X86_FAMILY_ANY - * @_model: The model number, model constant or X86_MODEL_ANY - * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY - * @_data: Driver specific data or NULL. The internal storage - * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. - * - * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is - * set to wildcards. - */ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ - X86_STEPPING_ANY, feature, data) - -/** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@vendor @@ -123,13 +94,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. 
*/ -#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ - X86_MODEL_ANY, feature, data) +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature @@ -139,12 +107,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. */ -#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ - X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, X86_FAMILY_ANY, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_FEATURE - Macro for matching a CPU feature @@ -152,12 +118,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. */ -#define X86_MATCH_FEATURE(feature, data) \ - X86_MATCH_VENDOR_FEATURE(ANY, feature, data) +#define X86_MATCH_FEATURE(feature, data) \ + X86_MATCH_CPU(X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model @@ -168,13 +132,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. */ -#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ - X86_FEATURE_ANY, data) +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, model, X86_STEPPING_ANY, \ + X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FAM - Match vendor and family @@ -184,12 +145,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set of wildcards. */ -#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ - X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \ + X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VFM - Match encoded vendor/family/model @@ -197,34 +156,26 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is cast to unsigned long internally. 
- * - * Stepping and feature are set to wildcards */ -#define X86_MATCH_VFM(vfm, data) \ - X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ - VFM_VENDOR(vfm), \ - VFM_FAMILY(vfm), \ - VFM_MODEL(vfm), \ - X86_STEPPING_ANY, X86_FEATURE_ANY, data) +#define X86_MATCH_VFM(vfm, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) #define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** - * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping + * X86_MATCH_VFM_STEPS - Match encoded vendor/family/model and steppings + * range. * @vfm: Encoded 8-bits each for vendor, family, model - * @steppings: Bitmask of steppings to match + * @min_step: Lowest stepping number to match + * @max_step: Highest stepping number to match * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is cast to unsigned long internally. - * - * feature is set to wildcard */ -#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \ - X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ - VFM_VENDOR(vfm), \ - VFM_FAMILY(vfm), \ - VFM_MODEL(vfm), \ - __X86_STEPPINGS(min_step, max_step), \ - X86_FEATURE_ANY, data) +#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + __X86_STEPPINGS(min_step, max_step), X86_FEATURE_ANY, \ + X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature @@ -233,15 +184,22 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is cast to unsigned long internally. - * - * Steppings is set to wildcard */ -#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \ - X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \ - VFM_VENDOR(vfm), \ - VFM_FAMILY(vfm), \ - VFM_MODEL(vfm), \ - X86_STEPPING_ANY, feature, data) +#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) + +/** + * X86_MATCH_VFM_CPU_TYPE - Match encoded vendor/family/model/type + * @vfm: Encoded 8-bits each for vendor, family, model + * @type: CPU type e.g. P-core, E-core + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. 
+ */ +#define X86_MATCH_VFM_CPU_TYPE(vfm, type, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, type, data) extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index de1ad09fe8d7..893cbca37fe9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -4,11 +4,12 @@ #include <asm/processor.h> -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#if defined(__KERNEL__) && !defined(__ASSEMBLER__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> +#include <asm/cpufeaturemasks.h> enum cpuid_leafs { @@ -37,92 +38,19 @@ enum cpuid_leafs NR_CPUID_WORDS, }; -#define X86_CAP_FMT_NUM "%d:%d" -#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) - extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define X86_CAP_FMT "%s" -#define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; +#define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) -/* - * There are 32 bits/features in each mask word. The high bits - * (selected with (bit>>5) give us the word number and the low 5 - * bits give us the bit/feature number inside the word. - * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can - * see if it is set in the mask word. - */ -#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ - (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) - -/* - * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the - * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all - * header macros which use NCAPINTS need to be changed. The duplicated macro - * use causes the compiler to issue errors for all headers so that all usage - * sites can be corrected. 
- */ -#define REQUIRED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ - REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 22)) - -#define DISABLED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ - DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 22)) - #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 
1 : \ test_cpu_cap(c, bit)) @@ -149,6 +77,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); +void check_cpufeature_deps(struct cpuinfo_x86 *c); #define setup_force_cpu_cap(bit) do { \ \ @@ -208,5 +137,5 @@ t_no: #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model -#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8f8aaf94dc00..70e1516b15da 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -2,14 +2,6 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include <asm/required-features.h> -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include <asm/disabled-features.h> -#endif - /* * Defines x86 CPU feature bits */ @@ -210,7 +202,6 @@ #define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ @@ -338,6 +329,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ @@ -469,6 +461,10 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ #define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */ +#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /* + * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs. + * (SRSO_MSR_FIX in the official doc). 
+ */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -535,4 +531,5 @@ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ +#define X86_BUG_SPECTRE_V2_USER X86_BUG(1*32 + 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index b2b9b4ef3dae..d5749b25fa10 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -1,222 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * CPUID-related helpers/definitions - */ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H -#include <linux/types.h> - -#include <asm/string.h> - -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; - -enum cpuid_regs_idx { - CPUID_EAX = 0, - CPUID_EBX, - CPUID_ECX, - CPUID_EDX, -}; - -#define CPUID_LEAF_MWAIT 0x5 -#define CPUID_LEAF_DCA 0x9 -#define CPUID_LEAF_XSTATE 0x0d -#define CPUID_LEAF_TSC 0x15 -#define CPUID_LEAF_FREQ 0x16 -#define CPUID_LEAF_TILE 0x1d - -#ifdef CONFIG_X86_32 -bool have_cpuid_p(void); -#else -static inline bool have_cpuid_p(void) -{ - return true; -} -#endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); -} - -#define native_cpuid_reg(reg) \ -static inline unsigned int native_cpuid_##reg(unsigned int op) \ -{ \ - unsigned int eax = op, ebx, ecx = 0, edx; \ - \ - native_cpuid(&eax, &ebx, &ecx, &edx); \ - \ - return reg; \ -} - -/* - * Native CPUID functions returning a single datum. - */ -native_cpuid_reg(eax) -native_cpuid_reg(ebx) -native_cpuid_reg(ecx) -native_cpuid_reg(edx) - -#ifdef CONFIG_PARAVIRT_XXL -#include <asm/paravirt.h> -#else -#define __cpuid native_cpuid -#endif - -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. 
- */ -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return eax; -} - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ebx; -} - -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ecx; -} - -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return edx; -} - -static inline void __cpuid_read(unsigned int leaf, unsigned int subleaf, u32 *regs) -{ - regs[CPUID_EAX] = leaf; - regs[CPUID_ECX] = subleaf; - __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); -} - -#define cpuid_subleaf(leaf, subleaf, regs) { \ - static_assert(sizeof(*(regs)) == 16); \ - __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ -} - -#define cpuid_leaf(leaf, regs) { \ - static_assert(sizeof(*(regs)) == 16); \ - __cpuid_read(leaf, 0, (u32 *)(regs)); \ -} - -static inline void __cpuid_read_reg(unsigned int leaf, unsigned int subleaf, - enum cpuid_regs_idx regidx, u32 *reg) -{ - u32 regs[4]; - - __cpuid_read(leaf, subleaf, regs); - *reg = regs[regidx]; -} - -#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ - static_assert(sizeof(*(reg)) == 4); \ - __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ -} - -#define cpuid_leaf_reg(leaf, regidx, reg) { \ - static_assert(sizeof(*(reg)) == 4); \ - __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ -} - -static __always_inline bool cpuid_function_is_indexed(u32 function) -{ - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x24: - case 0x8000001d: - return true; - } - - return false; -} - -#define for_each_possible_hypervisor_cpuid_base(function) \ - for (function = 0x40000000; function < 0x40010000; function += 0x100) - -static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) -{ - uint32_t base, eax, signature[3]; - - for_each_possible_hypervisor_cpuid_base(base) { - cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); - - /* - * This must not compile to "call memcmp" because it's called - * from PVH early boot code before instrumentation is set up - * and memcmp() itself may be instrumented. 
- */ - if (!__builtin_memcmp(sig, signature, 12) && - (leaves == 0 || ((eax - base) >= leaves))) - return base; - } - - return 0; -} +#include <asm/cpuid/api.h> #endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/cpuid/api.h b/arch/x86/include/asm/cpuid/api.h new file mode 100644 index 000000000000..9c180c9cc58e --- /dev/null +++ b/arch/x86/include/asm/cpuid/api.h @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CPUID_API_H +#define _ASM_X86_CPUID_API_H + +#include <asm/cpuid/types.h> + +#include <linux/build_bug.h> +#include <linux/types.h> + +#include <asm/string.h> + +/* + * Raw CPUID accessors: + */ + +#ifdef CONFIG_X86_32 +bool have_cpuid_p(void); +#else +static inline bool have_cpuid_p(void) +{ + return true; +} +#endif + +static inline void native_cpuid(u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +#define NATIVE_CPUID_REG(reg) \ +static inline u32 native_cpuid_##reg(u32 op) \ +{ \ + u32 eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum: + */ +NATIVE_CPUID_REG(eax) +NATIVE_CPUID_REG(ebx) +NATIVE_CPUID_REG(ecx) +NATIVE_CPUID_REG(edx) + +#ifdef CONFIG_PARAVIRT_XXL +# include <asm/paravirt.h> +#else +# define __cpuid native_cpuid +#endif + +/* + * Generic CPUID function + * + * Clear ECX since some CPUs (Cyrix MII) do not set or clear ECX + * resulting in stale register contents being returned. + */ +static inline void cpuid(u32 op, + u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ECX */ +static inline void cpuid_count(u32 op, int count, + u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum: + */ + +static inline u32 cpuid_eax(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline u32 cpuid_ebx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline u32 cpuid_ecx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline u32 cpuid_edx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +static inline void __cpuid_read(u32 leaf, u32 subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(u32 leaf, u32 subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + 
+static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x24: + case 0x8000001d: + return true; + } + + return false; +} + +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + +static inline u32 hypervisor_cpuid_base(const char *sig, u32 leaves) +{ + u32 base, eax, signature[3]; + + for_each_possible_hypervisor_cpuid_base(base) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + + /* + * This must not compile to "call memcmp" because it's called + * from PVH early boot code before instrumentation is set up + * and memcmp() itself may be instrumented. + */ + if (!__builtin_memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } + + return 0; +} + +#endif /* _ASM_X86_CPUID_API_H */ diff --git a/arch/x86/include/asm/cpuid/types.h b/arch/x86/include/asm/cpuid/types.h new file mode 100644 index 000000000000..8582e27e836d --- /dev/null +++ b/arch/x86/include/asm/cpuid/types.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CPUID_TYPES_H +#define _ASM_X86_CPUID_TYPES_H + +#include <linux/types.h> + +/* + * Types for raw CPUID access: + */ + +struct cpuid_regs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + +#define CPUID_LEAF_MWAIT 0x05 +#define CPUID_LEAF_DCA 0x09 +#define CPUID_LEAF_XSTATE 0x0d +#define CPUID_LEAF_TSC 0x15 +#define CPUID_LEAF_FREQ 0x16 +#define CPUID_LEAF_TILE 0x1d + +#endif /* _ASM_X86_CPUID_TYPES_H */ diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index 4acfd57de8f1..70f6b60ad67b 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUMASK_H #define _ASM_X86_CPUMASK_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cpumask.h> extern void setup_cpu_local_masks(void); @@ -34,5 +34,5 @@ static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp #define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu)) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_CPUMASK_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index bf5953883ec3..cc4a3f725b37 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -5,52 +5,28 @@ #include <linux/build_bug.h> #include <linux/compiler.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cache.h> #include <asm/percpu.h> struct task_struct; -struct pcpu_hot { - union { - struct { - struct task_struct *current_task; - int preempt_count; - int cpu_number; -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING - u64 call_depth; -#endif - unsigned long top_of_stack; - void *hardirq_stack_ptr; - u16 softirq_pending; -#ifdef CONFIG_X86_64 - bool hardirq_stack_inuse; -#else - void *softirq_stack_ptr; -#endif - }; - u8 pad[64]; - }; -}; -static_assert(sizeof(struct pcpu_hot) == 64); - -DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot); - -/* const-qualified alias to pcpu_hot, aliased by linker. 
*/ -DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override, - const_pcpu_hot); +DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task); +/* const-qualified alias provided by the linker. */ +DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override, + const_current_task); static __always_inline struct task_struct *get_current(void) { if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) - return this_cpu_read_const(const_pcpu_hot.current_task); + return this_cpu_read_const(const_current_task); - return this_cpu_read_stable(pcpu_hot.current_task); + return this_cpu_read_stable(current_task); } #define current get_current() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_CURRENT_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 62dc9f59ea76..ec95fe44fa3a 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -46,7 +46,6 @@ struct gdt_page { } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -DECLARE_INIT_PER_CPU(gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index d440a65af8f3..7e6b9314758a 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -58,7 +58,7 @@ #define DESC_USER (_DESC_DPL(3)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -166,7 +166,7 @@ struct desc_ptr { unsigned long address; } __attribute__((packed)) ; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* Boot IDT definitions */ #define BOOT_IDT_ENTRIES 32 diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index c492bdc97b05..000000000000 --- a/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. Use - * cpu_feature_enabled(). 
- */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_MITIGATION_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_MITIGATION_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_MITIGATION_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING -# define DISABLE_CALL_DEPTH_TRACKING 0 -#else -# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -#endif - -#ifdef CONFIG_ADDRESS_MASKING -# define DISABLE_LAM 0 -#else -# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_XEN_PV -# define DISABLE_XENPV 0 -#else -# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -#ifdef CONFIG_X86_USER_SHADOW_STACK -#define DISABLE_USER_SHSTK 0 -#else -#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) -#endif - -#ifdef CONFIG_X86_KERNEL_IBT -#define DISABLE_IBT 0 -#else -#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) -#endif - -#ifdef CONFIG_X86_FRED -# define DISABLE_FRED 0 -#else -# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) -#endif - -#ifdef CONFIG_KVM_AMD_SEV -#define DISABLE_SEV_SNP 0 -#else -#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) -#endif - -/* - * Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) 
-#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 (DISABLE_SEV_SNP) -#define DISABLED_MASK20 0 -#define DISABLED_MASK21 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 430fca13bb56..302e11b15da8 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_DWARF2_H #define _ASM_X86_DWARF2_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #warning "asm/dwarf2.h should be only included in pure assembly files" #endif diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 2e74a7f0e935..c83645d5b2a8 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -29,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void); extern u64 e820__memblock_alloc_reserved(u64 size, u64 align); extern void e820__memblock_setup(void); -extern void e820__reserve_setup_data(void); extern void e820__finish_early_params(void); extern void e820__reserve_resources(void); extern void e820__reserve_resources_late(void); diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 314f75d886d0..80c4a7266629 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -35,15 +35,6 @@ enum e820_type { * marking it with the IORES_DESC_SOFT_RESERVED designation. */ E820_TYPE_SOFT_RESERVED = 0xefffffff, - - /* - * Reserved RAM used by the kernel itself if - * CONFIG_INTEL_TXT=y is enabled, memory of this type - * will be included in the S3 integrity calculation - * and so should not include any memory that the BIOS - * might alter over the S3 transition: - */ - E820_TYPE_RESERVED_KERN = 128, }; /* diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index 426fc53ff803..dfbd1ebb9f10 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -13,7 +13,7 @@ static inline void edac_atomic_scrub(void *va, u32 size) * are interrupt, DMA and SMP safe. 
*/ for (i = 0; i < size / 4; i++, virt_addr++) - asm volatile("lock; addl $0, %0"::"m" (*virt_addr)); + asm volatile("lock addl $0, %0"::"m" (*virt_addr)); } #endif /* _ASM_X86_EDAC_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1fb83d47711f..128602612eca 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -54,8 +54,9 @@ typedef struct user_i387_struct elf_fpregset_t; #define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ #define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ #define R_X86_64_RELATIVE 8 /* Adjust by program base */ -#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative - offset to GOT */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */ +#define R_X86_64_GOTPCRELX 41 +#define R_X86_64_REX_GOTPCRELX 42 #define R_X86_64_32 10 /* Direct 32 bit zero extended */ #define R_X86_64_32S 11 /* Direct 32 bit sign extended */ #define R_X86_64_16 12 /* Direct 16 bit zero extended */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d0dcefb5cc59..4519c9f35ba0 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -31,7 +31,7 @@ /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ #define FIXMAP_PMD_TOP 507 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/kernel.h> #include <asm/apicdef.h> #include <asm/page.h> @@ -196,5 +196,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, void __early_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f86ad3335529..f42de5f05e7e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,10 +16,9 @@ /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It - * disables preemption so be careful if you intend to use it for long periods - * of time. - * If you intend to use the FPU in irq/softirq you need to check first with - * irq_fpu_usable() if it is possible. + * disables preemption and softirq processing, so be careful if you intend to + * use it for long periods of time. Kernel-mode FPU cannot be used in all + * contexts -- see irq_fpu_usable() for details. */ /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ @@ -50,10 +49,10 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. - * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in - * a random state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while + * using the FPU in kernel mode. A context switch will (and softirq might) save + * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving + * CPU's FPU registers in a random state. * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. @@ -63,8 +62,6 @@ static inline void kernel_fpu_begin(void) * preemptible. Disabling preemption is the right choice here as bottom * half processing is always in thread context on RT kernels so it * implicitly prevents bottom half processing as well. - * - * Disabling preemption also serializes against kernel_fpu_begin(). 
*/ static inline void fpregs_lock(void) { diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index fb42659f6e98..0ab65073c1cc 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -11,7 +11,7 @@ #ifdef CONFIG_FRAME_POINTER -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro FRAME_BEGIN push %_ASM_BP @@ -51,7 +51,7 @@ .endm #endif /* CONFIG_X86_64 */ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define FRAME_BEGIN \ "push %" _ASM_BP "\n" \ @@ -82,18 +82,18 @@ static inline unsigned long encode_frame_pointer(struct pt_regs *regs) #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ENCODE_FRAME_POINTER ptregs_offset=0 .endm -#else /* !__ASSEMBLY */ +#else /* !__ASSEMBLER__ */ #define ENCODE_FRAME_POINTER diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 25ca00bd70e8..2a29e5216881 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -32,7 +32,7 @@ #define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) #define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_X86_FRED #include <linux/kernel.h> @@ -113,6 +113,6 @@ static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { static inline void fred_sync_rsp0(unsigned long rsp0) { } static inline void fred_update_rsp0(void) { } #endif /* CONFIG_X86_FRED */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 9e7e8ca8e299..02f239569b93 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -2,7 +2,7 @@ #ifndef _ASM_FSGSBASE_H #define _ASM_FSGSBASE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_X86_64 @@ -80,6 +80,6 @@ extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_FSGSBASE_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f9cb4d07df58..93156ac4ffe0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -22,7 +22,7 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) @@ -36,21 +36,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) { -#ifdef CONFIG_X86_KERNEL_IBT - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. 
- */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) + if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; -#endif + return fentry_ip; } #define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) @@ -118,11 +106,11 @@ struct dyn_arch_ftrace { }; #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer); @@ -166,6 +154,6 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ #endif /* !COMPILE_OFFSETS */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6ffa8b75f4cd..f00c09ffe6a9 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,7 +3,6 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> -#include <asm/current.h> typedef struct { #if IS_ENABLED(CONFIG_KVM_INTEL) @@ -66,7 +65,8 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat -#define local_softirq_pending_ref pcpu_hot.softirq_pending +DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +#define local_softirq_pending_ref __softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) /* diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index edebf1020e04..162ebd73a698 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,7 +16,7 @@ #include <asm/irq_vectors.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/percpu.h> #include <linux/profile.h> @@ -128,6 +128,6 @@ extern char spurious_entries_start[]; typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -#endif /* !ASSEMBLY_ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_HW_IRQ_H */ diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 1e59581d500c..28d845257303 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -21,7 +21,7 @@ #define HAS_KERNEL_IBT 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_X86_64 #define ASM_ENDBR "endbr64\n\t" @@ -41,7 +41,7 @@ _ASM_PTR fname "\n\t" \ ".popsection\n\t" -static inline __attribute_const__ u32 gen_endbr(void) +static __always_inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -56,7 +56,7 @@ static inline __attribute_const__ u32 gen_endbr(void) return endbr; } -static inline __attribute_const__ u32 gen_endbr_poison(void) +static __always_inline __attribute_const__ u32 gen_endbr_poison(void) { /* * 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it @@ -65,19 +65,24 @@ static inline __attribute_const__ u32 gen_endbr_poison(void) return 0x001f0f66; /* osp nopl (%rax) */ } -static inline bool is_endbr(u32 val) +static inline bool __is_endbr(u32 val) { if (val == gen_endbr_poison()) return true; + /* See cfi_fineibt_bhi_preamble() */ + if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5) + return true; + val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */ return val == gen_endbr(); } +extern __noendbr bool is_endbr(u32 *val); extern __noendbr 
u64 ibt_save(bool disable); extern __noendbr void ibt_restore(u64 save); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #define ENDBR endbr64 @@ -85,29 +90,29 @@ extern __noendbr void ibt_restore(u64 save); #define ENDBR endbr32 #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* !IBT */ #define HAS_KERNEL_IBT 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define ASM_ENDBR #define IBT_NOSEAL(name) #define __noendbr -static inline bool is_endbr(u32 val) { return false; } +static inline bool is_endbr(u32 *val) { return false; } static inline u64 ibt_save(bool disable) { return 0; } static inline void ibt_restore(u64 save) { } -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define ENDBR -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_X86_KERNEL_IBT */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index ad5c68f0509d..a4ec27c67988 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -7,7 +7,7 @@ #define IDT_ALIGN (8 * (1 + HAS_KERNEL_IBT)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/entry-common.h> #include <linux/hardirq.h> @@ -474,7 +474,7 @@ static inline void fred_install_sysvec(unsigned int vector, const idtentry_t fun idt_install_sysvec(vector, asm_##function); \ } -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ /* * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs. @@ -579,7 +579,7 @@ SYM_CODE_START(spurious_entries_start) SYM_CODE_END(spurious_entries_start) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * The actual entry points. Note that DECLARE_IDTENTRY*() serves two diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 0e82ebc5d1e1..8b1b1abcef15 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,7 +2,11 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 +#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector +#else #define __head __section(".head.text") __no_sanitize_undefined +#endif struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 438ccd4f3cc4..e48a00b3311d 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -6,7 +6,7 @@ #ifndef X86_ASM_INST_H #define X86_ASM_INST_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define REG_NUM_INVALID 100 diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6d7b04ffc5fd..3a97a7eefb51 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -45,7 +45,18 @@ /* Wildcard match so X86_MATCH_VFM(ANY) works */ #define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) +/* Family 5 */ +#define INTEL_FAM5_START IFM(5, 0x00) /* Notational marker, also P5 A-step */ +#define INTEL_PENTIUM_75 IFM(5, 0x02) /* P54C */ +#define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ +#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ + +/* Family 6 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) +#define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) +#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) +#define INTEL_PENTIUM_III_TUALATIN IFM(6, 0x0B) +#define INTEL_PENTIUM_M_DOTHAN IFM(6, 0x0D) #define INTEL_CORE_YONAH IFM(6, 0x0E) @@ -110,9 +121,9 @@ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define 
INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */ -#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ @@ -126,16 +137,16 @@ #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_METEORLAKE IFM(6, 0xAC) +#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */ #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_ARROWLAKE_H IFM(6, 0xC5) +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */ #define INTEL_ARROWLAKE IFM(6, 0xC6) #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_LUNARLAKE_M IFM(6, 0xBD) +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ /* "Small Core" Processors (Atom/E-Core) */ @@ -149,9 +160,9 @@ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ @@ -176,16 +187,35 @@ #define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ #define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ -/* Family 5 */ -#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ +/* Notational marker denoting the last Family 6 model */ +#define INTEL_FAM6_LAST IFM(6, 0xFF) + +/* Family 15 - NetBurst */ +#define INTEL_P4_WILLAMETTE IFM(15, 0x01) /* Also Xeon Foster */ +#define INTEL_P4_PRESCOTT IFM(15, 0x03) +#define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) +#define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ /* Family 19 */ #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ -/* CPU core types */ +/* + * Intel CPU core types + * + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. + * Core type and native model ID are defined in below enumerations. 
+ */ enum intel_cpu_type { + INTEL_CPU_TYPE_UNKNOWN, INTEL_CPU_TYPE_ATOM = 0x20, INTEL_CPU_TYPE_CORE = 0x40, }; +enum intel_native_id { + INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */ + INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */ +}; + #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ed580c7f9d0a..1a0dc2b2bf5b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); +#define arch_memremap_wb arch_memremap_wb + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 562a547c29a5..735c3a491f60 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -116,7 +116,7 @@ ASM_CALL_ARG2 #define call_on_irqstack(func, asm_call, argconstr...) \ - call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \ + call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ func, asm_call, argconstr) /* Macros to assert type correctness for run_*_on_irqstack macros */ @@ -135,7 +135,7 @@ * User mode entry and interrupt on the irq stack do not \ * switch stacks. If from user mode the task stack is empty. \ */ \ - if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \ + if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \ irq_enter_rcu(); \ func(c_args); \ irq_exit_rcu(); \ @@ -146,9 +146,9 @@ * places. Invoke the stack switch macro with the call \ * sequence which matches the above direct invocation. 
\ */ \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ + __this_cpu_write(hardirq_stack_inuse, true); \ call_on_irqstack(func, asm_call, constr); \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ + __this_cpu_write(hardirq_stack_inuse, false); \ } \ } @@ -212,9 +212,9 @@ */ #define do_softirq_own_stack() \ { \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \ + __this_cpu_write(hardirq_stack_inuse, true); \ call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ - __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \ + __this_cpu_write(hardirq_stack_inuse, false); \ } #endif diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index cf7fc2b8e3ce..abb8374c9ff7 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,7 +4,7 @@ #include <asm/processor-flags.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/nospec-branch.h> @@ -79,7 +79,7 @@ static __always_inline void native_local_irq_restore(unsigned long flags) #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> static __always_inline unsigned long arch_local_save_flags(void) @@ -133,10 +133,10 @@ static __always_inline unsigned long arch_local_irq_save(void) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT_XXL */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); @@ -154,6 +154,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3f1c1d6c0da1..61dd1dee7812 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -7,7 +7,7 @@ #include <asm/asm.h> #include <asm/nops.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stringify.h> #include <linux/types.h> @@ -55,6 +55,6 @@ l_yes: extern int arch_jump_entry_size(struct jump_entry *entry); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index de75306b932e..d7e33c7f096b 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -23,7 +23,7 @@ (1ULL << (__VIRTUAL_MASK_SHIFT - \ KASAN_SHADOW_SCALE_SHIFT))) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_KASAN void __init kasan_early_init(void); diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 8ad187462b68..5432457d2338 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -13,11 +13,12 @@ # define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/string.h> #include <linux/kernel.h> +#include <asm/asm.h> #include <asm/page.h> #include <asm/ptrace.h> @@ -71,41 +72,32 @@ static inline void crash_setup_regs(struct pt_regs *newregs, if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); } else { + asm volatile("mov %%" _ASM_BX ",%0" : "=m"(newregs->bx)); + asm volatile("mov %%" _ASM_CX ",%0" : "=m"(newregs->cx)); + asm volatile("mov %%" _ASM_DX ",%0" : "=m"(newregs->dx)); + asm volatile("mov %%" _ASM_SI ",%0" : "=m"(newregs->si)); + asm volatile("mov %%" _ASM_DI ",%0" : "=m"(newregs->di)); + asm 
volatile("mov %%" _ASM_BP ",%0" : "=m"(newregs->bp)); + asm volatile("mov %%" _ASM_AX ",%0" : "=m"(newregs->ax)); + asm volatile("mov %%" _ASM_SP ",%0" : "=m"(newregs->sp)); +#ifdef CONFIG_X86_64 + asm volatile("mov %%r8,%0" : "=m"(newregs->r8)); + asm volatile("mov %%r9,%0" : "=m"(newregs->r9)); + asm volatile("mov %%r10,%0" : "=m"(newregs->r10)); + asm volatile("mov %%r11,%0" : "=m"(newregs->r11)); + asm volatile("mov %%r12,%0" : "=m"(newregs->r12)); + asm volatile("mov %%r13,%0" : "=m"(newregs->r13)); + asm volatile("mov %%r14,%0" : "=m"(newregs->r14)); + asm volatile("mov %%r15,%0" : "=m"(newregs->r15)); +#endif + asm volatile("mov %%ss,%k0" : "=a"(newregs->ss)); + asm volatile("mov %%cs,%k0" : "=a"(newregs->cs)); #ifdef CONFIG_X86_32 - asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); - asm volatile("movl %%ecx,%0" : "=m"(newregs->cx)); - asm volatile("movl %%edx,%0" : "=m"(newregs->dx)); - asm volatile("movl %%esi,%0" : "=m"(newregs->si)); - asm volatile("movl %%edi,%0" : "=m"(newregs->di)); - asm volatile("movl %%ebp,%0" : "=m"(newregs->bp)); - asm volatile("movl %%eax,%0" : "=m"(newregs->ax)); - asm volatile("movl %%esp,%0" : "=m"(newregs->sp)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds)); - asm volatile("movl %%es, %%eax;" :"=a"(newregs->es)); - asm volatile("pushfl; popl %0" :"=m"(newregs->flags)); -#else - asm volatile("movq %%rbx,%0" : "=m"(newregs->bx)); - asm volatile("movq %%rcx,%0" : "=m"(newregs->cx)); - asm volatile("movq %%rdx,%0" : "=m"(newregs->dx)); - asm volatile("movq %%rsi,%0" : "=m"(newregs->si)); - asm volatile("movq %%rdi,%0" : "=m"(newregs->di)); - asm volatile("movq %%rbp,%0" : "=m"(newregs->bp)); - asm volatile("movq %%rax,%0" : "=m"(newregs->ax)); - asm volatile("movq %%rsp,%0" : "=m"(newregs->sp)); - asm volatile("movq %%r8,%0" : "=m"(newregs->r8)); - asm volatile("movq %%r9,%0" : "=m"(newregs->r9)); - asm volatile("movq %%r10,%0" : "=m"(newregs->r10)); - asm volatile("movq %%r11,%0" : "=m"(newregs->r11)); - asm volatile("movq %%r12,%0" : "=m"(newregs->r12)); - asm volatile("movq %%r13,%0" : "=m"(newregs->r13)); - asm volatile("movq %%r14,%0" : "=m"(newregs->r14)); - asm volatile("movq %%r15,%0" : "=m"(newregs->r15)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("pushfq; popq %0" :"=m"(newregs->flags)); + asm volatile("mov %%ds,%k0" : "=a"(newregs->ds)); + asm volatile("mov %%es,%k0" : "=a"(newregs->es)); #endif + asm volatile("pushf\n\t" + "pop %0" : "=m"(newregs->flags)); newregs->ip = _THIS_IP_; } } @@ -225,6 +217,6 @@ unsigned int arch_crash_get_elfcorehdr_size(void); #define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index dc31b13b87a0..b51d8a4673f5 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -38,7 +38,7 @@ #define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) #define SYM_F_ALIGN __FUNC_ALIGN -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk @@ -50,7 +50,7 @@ #endif #endif /* CONFIG_MITIGATION_RETPOLINE */ -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #if defined(CONFIG_MITIGATION_RETHUNK) && 
!defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" @@ -62,7 +62,7 @@ #endif #endif /* CONFIG_MITIGATION_RETPOLINE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the @@ -119,33 +119,27 @@ /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ #define SYM_FUNC_START_NOALIGN(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ #define SYM_FUNC_START_LOCAL_NOALIGN(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ #define SYM_FUNC_START_WEAK_NOALIGN(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_A_NONE) #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb2db07ef39c..6c77c03139f7 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -296,8 +296,6 @@ enum mcp_flags { void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -bool mce_notify_irq(void); - DECLARE_PER_CPU(struct mce, injectm); /* Disable CMCI/polling for MCA bank claimed by firmware */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index f922b682b9b4..1530ee301dfe 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -10,7 +10,7 @@ #ifndef __X86_MEM_ENCRYPT_H__ #define __X86_MEM_ENCRYPT_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/init.h> #include <linux/cc_platform.h> @@ -114,6 +114,6 @@ void add_encrypt_protection_map(void); extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 3b496cdcb74b..8b8055a8eb9e 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -69,6 +69,18 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif + +#ifdef CONFIG_BROADCAST_TLB_FLUSH + /* + * The global ASID will be a non-zero value when the process has + * the same ASID across all CPUs, allowing it to make use of + * hardware-assisted remote TLB invalidation like AMD INVLPGB. + */ + u16 global_asid; + + /* The process is transitioning to a new global ASID number. 
*/ + bool asid_transition; +#endif } mm_context_t; #define INIT_MM_CONTEXT(mm) \ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 795fdd53bd0a..2398058b6e83 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H -#include <asm/desc.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/pkeys.h> @@ -13,6 +12,7 @@ #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/gsseg.h> +#include <asm/desc.h> extern atomic64_t last_mm_ctx_id; @@ -139,6 +139,11 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm) #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); +#define mm_init_global_asid mm_init_global_asid +extern void mm_init_global_asid(struct mm_struct *mm); + +extern void mm_free_global_asid(struct mm_struct *mm); + /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). @@ -161,6 +166,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif + + mm_init_global_asid(mm); mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; @@ -170,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk, static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); + mm_free_global_asid(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f91ab1e75f9f..5e6193dbc97e 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -77,11 +77,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_tdx_hypercall(control, input_address, output_address); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address) + : [output_address] "r" (output_address) : "cc", "memory", "r8", "r9", "r10", "r11"); return hv_status; } @@ -89,12 +89,12 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), - THUNK_TARGET(hv_hypercall_pg) + : [output_address] "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -187,18 +187,18 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) return hv_tdx_hypercall(control, input1, input2); if (hv_isolation_type_snp() && !hyperv_paravisor_present) { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" "vmmcall" : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2) + : [input2] "r" (input2) : "cc", "r8", "r9", "r10", "r11"); } else { - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[input2], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2), + : [input2] "r" (input2), 
THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 72765b2fe0d8..bc6d2de109b5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) @@ -34,6 +35,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* @@ -721,6 +723,7 @@ /* Zen4 */ #define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 /* Fam 19h MSRs */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 001853541f1e..9397a319d165 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -4,7 +4,7 @@ #include "msr-index.h" -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/asm.h> #include <asm/errno.h> @@ -397,5 +397,5 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 41a0ebb699ec..f677382093f3 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -56,6 +56,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); + void stop_nmi(void); void restart_nmi(void); void local_touch_nmi(void); diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index 1c1b7550fa55..cd94221d8335 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -82,7 +82,7 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern const unsigned char * const x86_nops[]; #endif diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index aee26bb8230f..e4d11e3318f0 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,7 +12,6 @@ #include <asm/msr-index.h> #include <asm/unwind_hints.h> #include <asm/percpu.h> -#include <asm/current.h> /* * Call depth tracking for Intel SKL CPUs to address the RSB underflow @@ -78,21 +77,21 @@ #include <asm/asm-offsets.h> #define CREDIT_CALL_DEPTH \ - movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq $-1, PER_CPU_VAR(__x86_call_depth); #define RESET_CALL_DEPTH \ xor %eax, %eax; \ bts $63, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); + movq %rax, PER_CPU_VAR(__x86_call_depth); #define RESET_CALL_DEPTH_FROM_CALL \ movb $0xfc, %al; \ shl $56, %rax; \ - movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + movq %rax, PER_CPU_VAR(__x86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #define INCREMENT_CALL_DEPTH \ - sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \ + sarq $5, PER_CPU_VAR(__x86_call_depth); \ CALL_THUNKS_DEBUG_INC_CALLS #else @@ -177,7 +176,7 @@ add $(BITS_PER_LONG/8), %_ASM_SP; \ lfence; -#ifdef __ASSEMBLY__ +#ifdef 
__ASSEMBLER__ /* * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions @@ -335,7 +334,7 @@ #define CLEAR_BRANCH_HISTORY_VMEXIT #endif -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; @@ -387,6 +386,8 @@ extern void call_depth_return_thunk(void); __stringify(INCREMENT_CALL_DEPTH), \ X86_FEATURE_CALL_DEPTH) +DECLARE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); + #ifdef CONFIG_CALL_THUNKS_DEBUG DECLARE_PER_CPU(u64, __x86_call_count); DECLARE_PER_CPU(u64, __x86_ret_count); @@ -521,7 +522,7 @@ extern u64 x86_pred_cmd; static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); + alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_IBPB); } /* The Intel SPEC CTRL MSR base value cache */ @@ -558,6 +559,8 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb); + DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); @@ -602,6 +605,6 @@ static __always_inline void mds_idle_clear_cpu_buffers(void) mds_clear_cpu_buffers(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 46d7e06763c9..e0125afa53fb 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -45,7 +45,7 @@ #define ORC_TYPE_REGS 3 #define ORC_TYPE_REGS_PARTIAL 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/byteorder.h> /* @@ -73,6 +73,6 @@ struct orc_entry { #endif } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index c9fe207916f4..9265f2fca99a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -14,7 +14,7 @@ #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct page; @@ -84,7 +84,7 @@ static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) return __canonical_address(vaddr, vaddr_bits) == vaddr; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 580d71aca65a..0c623706cb7e 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -4,7 +4,7 @@ #include <asm/page_32_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) #ifdef CONFIG_DEBUG_VIRTUAL @@ -26,6 +26,6 @@ static inline void copy_page(void *to, void *from) { memcpy(to, from, PAGE_SIZE); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_32_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index faf9cc1c14bb..a9b62e0e6f79 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -11,8 +11,8 @@ * a virtual address space of one gigabyte, which limits the * amount of physical memory you can use to about 950MB. * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. 
+ * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G + * and CONFIG_HIGHMEM4G options in the kernel configuration. */ #define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) #define __PAGE_OFFSET __PAGE_OFFSET_BASE @@ -63,7 +63,7 @@ */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This much address space is reserved for vmalloc() and iomap() @@ -75,6 +75,6 @@ extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); extern void setup_bootmem_allocator(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_32_DEFS_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index d63576608ce7..d3aab6f4e59a 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -4,7 +4,7 @@ #include <asm/page_64_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/cpufeatures.h> #include <asm/alternative.h> @@ -55,11 +55,12 @@ static inline void clear_page(void *page) clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, "=D" (page), - "D" (page) - : "cc", "memory", "rax", "rcx"); + "D" (page), + "cc", "memory", "rax", "rcx"); } void copy_page(void *to, void *from); +KCFI_REFERENCE(copy_page); #ifdef CONFIG_X86_5LEVEL /* @@ -94,7 +95,7 @@ static __always_inline unsigned long task_size_max(void) } #endif /* CONFIG_X86_5LEVEL */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION # define __HAVE_ARCH_GATE_AREA 1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 06ef25411d62..1faa8f88850a 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/kaslr.h> #endif diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 974688973cf6..9f77bf03d747 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -43,7 +43,7 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif /* CONFIG_X86_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK extern phys_addr_t physical_mask; @@ -66,6 +66,6 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern void initmem_init(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 041aff51eb50..bed346bfac89 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -6,7 +6,7 @@ #include <asm/paravirt_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct mm_struct; #endif @@ -15,7 +15,7 @@ struct mm_struct; #include <asm/asm.h> #include <asm/nospec-branch.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> @@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); -} - static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); @@ -720,7 +715,7 @@ static __always_inline unsigned long arch_local_irq_save(void) extern void 
default_banner(void); void native_pv_lock_init(void) __init; -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL @@ -740,18 +735,18 @@ void native_pv_lock_init(void) __init; #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { @@ -769,5 +764,5 @@ static inline void paravirt_set_cap(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fea56b04f436..62912023b46f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -4,7 +4,7 @@ #ifdef CONFIG_PARAVIRT -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/desc_defs.h> @@ -134,8 +134,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); - /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); @@ -242,9 +240,17 @@ extern struct paravirt_patch_template pv_ops; #define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) -int paravirt_disable_iospace(void); - -/* This generates an indirect call based on the operation type number. */ +/* + * This generates an indirect call based on the operation type number. + * + * Since alternatives run after enabling CET/IBT -- the latter setting/clearing + * capabilities and the former requiring all capabilities being finalized -- + * these indirect calls are subject to IBT and the paravirt stubs should have + * ENDBR on. + * + * OTOH since this is effectively a __nocfi indirect call, the paravirt stubs + * don't need to bother with CFI prefixes. 
+ */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" @@ -519,7 +525,7 @@ unsigned long pv_native_read_cr2(void); #define paravirt_nop ((void *)nop_func) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e525cd85f999..105db2d33c7b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -10,7 +10,7 @@ # define __percpu_rel #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_SMP # define __percpu %__percpu_seg: @@ -20,14 +20,9 @@ #define PER_CPU_VAR(var) __percpu(var)__percpu_rel -#ifdef CONFIG_X86_64_SMP -# define INIT_PER_CPU_VAR(var) init_per_cpu__##var -#else -# define INIT_PER_CPU_VAR(var) var -#endif - #else /* !__ASSEMBLY__: */ +#include <linux/args.h> #include <linux/build_bug.h> #include <linux/stringify.h> #include <asm/asm.h> @@ -41,12 +36,7 @@ # define __seg_fs __attribute__((address_space(__seg_fs))) #endif -#ifdef CONFIG_X86_64 -# define __percpu_seg_override __seg_gs -#else -# define __percpu_seg_override __seg_fs -#endif - +#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) #define __percpu_prefix "" #else /* !CONFIG_CC_HAS_NAMED_AS: */ @@ -98,22 +88,6 @@ #define __force_percpu_arg(x) __force_percpu_prefix "%" #x /* - * Initialized pointers to per-CPU variables needed for the boot - * processor need to use these macros to get the proper address - * offset from __per_cpu_load on SMP. - * - * There also must be an entry in vmlinux_64.lds.S - */ -#define DECLARE_INIT_PER_CPU(var) \ - extern typeof(var) init_per_cpu_var(var) - -#ifdef CONFIG_X86_64_SMP -# define init_per_cpu_var(var) init_per_cpu__##var -#else -# define init_per_cpu_var(var) var -#endif - -/* * For arch-specific code, we can use direct single-insn ops (they * don't give an lvalue though). 
*/ @@ -128,15 +102,10 @@ #define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) #define __pcpu_cast_8(val) ((u64)(val)) -#define __pcpu_op1_1(op, dst) op "b " dst -#define __pcpu_op1_2(op, dst) op "w " dst -#define __pcpu_op1_4(op, dst) op "l " dst -#define __pcpu_op1_8(op, dst) op "q " dst - -#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst -#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst -#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst -#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst +#define __pcpu_op_1(op) op "b " +#define __pcpu_op_2(op) op "w " +#define __pcpu_op_4(op) op "l " +#define __pcpu_op_8(op) op "q " #define __pcpu_reg_1(mod, x) mod "q" (x) #define __pcpu_reg_2(mod, x) mod "r" (x) @@ -168,7 +137,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \ + asm qual (__pcpu_op_##size("mov") \ + __percpu_arg([var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "m" (__my_cpu_var(_var))); \ \ @@ -184,7 +154,8 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("mov") "%[val], " \ + __percpu_arg([var]) \ : [var] "=m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -201,7 +172,8 @@ do { \ ({ \ __pcpu_type_##size pfo_val__; \ \ - asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \ + asm(__pcpu_op_##size("mov") \ + __force_percpu_arg(a[var]) ", %[val]" \ : [val] __pcpu_reg_##size("=", pfo_val__) \ : [var] "i" (&(_var))); \ \ @@ -210,7 +182,7 @@ do { \ #define percpu_unary_op(size, qual, op, _var) \ ({ \ - asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var))); \ }) @@ -223,7 +195,7 @@ do { \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ + asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \ : [var] "+m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) @@ -259,8 +231,8 @@ do { \ ({ \ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ \ - asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("xadd") "%[tmp], " \ + __percpu_arg([var]) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ @@ -303,8 +275,8 @@ do { \ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ : [oval] "+a" (pco_old__), \ [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ @@ -320,8 +292,8 @@ do { \ __pcpu_type_##size pco_old__ = *pco_oval__; \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ CC_SET(z) \ : CC_OUT(z) (success), \ [oval] "+a" (pco_old__), \ @@ -348,15 +320,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" 
(new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -378,17 +349,16 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ - "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ - CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ \ @@ -419,15 +389,14 @@ do { \ old__.var = _oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - : [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ \ old__.var; \ }) @@ -449,19 +418,19 @@ do { \ old__.var = *_oval; \ new__.var = _nval; \ \ - asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ - "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ - CC_SET(z) \ - : CC_OUT(z) (success), \ - [var] "+m" (__my_cpu_var(_var)), \ - "+a" (old__.low), \ - "+d" (old__.high) \ - : "b" (new__.low), \ - "c" (new__.high), \ - "S" (&(_var)) \ - : "memory"); \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ if (unlikely(!success)) \ *_oval = old__.var; \ + \ likely(success); \ }) @@ -582,7 +551,7 @@ do { \ * it is accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across CPUs. The current users include - * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are + * current_task and cpu_current_top_of_stack, both of which are * actually per-thread variables implemented as per-CPU variables and * thus stable for the duration of the respective task. */ @@ -617,9 +586,9 @@ do { \ #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). 
*/ -DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0ba8d20f2d1d..812dac3f79f0 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,6 +141,12 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -482,6 +488,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ @@ -509,6 +524,8 @@ struct pebs_xmm { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -534,8 +551,11 @@ struct pebs_xmm { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd4841231bb9..a33147520044 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -29,11 +29,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -/* - * Flags to use when allocating a user page table page. - */ -extern gfp_t __userpte_alloc_gfp; - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. 
Being order-1, it is diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 4a12c276b181..66425424ce91 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H #define _ASM_X86_PGTABLE_2LEVEL_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> typedef unsigned long pteval_t; @@ -16,7 +16,7 @@ typedef union { pteval_t pte; pteval_t pte_low; } pte_t; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define SHARED_KERNEL_PMD 0 diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 80911349519e..9d5b257d44e3 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H #define _ASM_X86_PGTABLE_3LEVEL_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> typedef u64 pteval_t; @@ -25,7 +25,7 @@ typedef union { }; pmdval_t pmd; } pmd_t; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h index a0c1525f1b6f..e12e52ae8083 100644 --- a/arch/x86/include/asm/pgtable-invert.h +++ b/arch/x86/include/asm/pgtable-invert.h @@ -2,7 +2,7 @@ #ifndef _ASM_PGTABLE_INVERT_H #define _ASM_PGTABLE_INVERT_H 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * A clear pte value is special, and doesn't get inverted. @@ -36,6 +36,6 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) return val; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 593f10aabd45..7bd6bd6df4a1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -15,7 +15,7 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> @@ -973,7 +973,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_X86_32 @@ -982,7 +982,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) # include <asm/pgtable_64.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> @@ -1233,12 +1233,12 @@ static inline int pgd_none(pgd_t pgd) } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int direct_gbpages; void init_mem_mapping(void); @@ -1812,6 +1812,6 @@ bool arch_is_platform_page(u64 paddr); WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7d4ad8907297..b612cc57a4d3 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -13,7 +13,7 @@ * This file contains the functions and defines necessary to modify and use * the i386 page 
table tree. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/processor.h> #include <linux/threads.h> #include <asm/paravirt.h> @@ -45,7 +45,7 @@ do { \ flush_tlb_one_kernel((vaddr)); \ } while (0) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * This is used to calculate the .brk reservation for initial pagetables. diff --git a/arch/x86/include/asm/pgtable_32_areas.h b/arch/x86/include/asm/pgtable_32_areas.h index b6355416a15a..921148b42967 100644 --- a/arch/x86/include/asm/pgtable_32_areas.h +++ b/arch/x86/include/asm/pgtable_32_areas.h @@ -13,7 +13,7 @@ */ #define VMALLOC_OFFSET (8 * 1024 * 1024) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern bool __vmalloc_start_set; /* set once high_memory is set */ #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index d1426b64c1b9..b89f8f1194a9 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -5,7 +5,7 @@ #include <linux/const.h> #include <asm/pgtable_64_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This file contains the functions and defines necessary to modify and use @@ -270,7 +270,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) #include <asm/pgtable-invert.h> -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) @@ -291,5 +291,5 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) i = i + 1 ; \ .endr -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index ec68f8369bdc..5bb782d856f2 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -4,7 +4,7 @@ #include <asm/sparsemem.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/kaslr.h> @@ -44,7 +44,7 @@ static inline bool pgtable_l5_enabled(void) extern unsigned int pgdir_shift; extern unsigned int ptrs_per_p4d; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define SHARED_KERNEL_PMD 0 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4b804531b03c..b74ec5c3643b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -33,6 +33,7 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 #ifdef CONFIG_X86_64 @@ -64,6 +65,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) @@ -164,7 +166,7 @@ * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, @@ -177,7 +179,7 @@ enum page_cache_mode { }; #endif -#define _PAGE_CC (_AT(pteval_t, cc_mask)) +#define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) @@ -239,7 +241,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) @@ -262,7 +264,7 @@ enum page_cache_mode { #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * early identity mapping pte attrib macros. @@ -281,7 +283,7 @@ enum page_cache_mode { # include <asm/pgtable_64_types.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -580,6 +582,6 @@ extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 919909d8cb77..578441db09f0 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,10 +4,11 @@ #include <asm/rmwcc.h> #include <asm/percpu.h> -#include <asm/current.h> #include <linux/static_call_types.h> +DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count); + /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -23,18 +24,18 @@ */ static __always_inline int preempt_count(void) { - return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED; + return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED; } static __always_inline void preempt_count_set(int pc) { int old, new; - old = raw_cpu_read_4(pcpu_hot.preempt_count); + old = raw_cpu_read_4(__preempt_count); do { new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new)); + } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new)); } /* @@ -43,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* @@ -57,17 +58,17 @@ static __always_inline void preempt_count_set(int pc) static __always_inline void set_preempt_need_resched(void) { - raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED); + raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); } static __always_inline void clear_preempt_need_resched(void) { - raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED); + raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED); } static __always_inline bool test_preempt_need_resched(void) { - return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED); + return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED); } /* @@ -76,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void) static __always_inline void __preempt_count_add(int val) { - raw_cpu_add_4(pcpu_hot.preempt_count, val); + 
raw_cpu_add_4(__preempt_count, val); } static __always_inline void __preempt_count_sub(int val) { - raw_cpu_add_4(pcpu_hot.preempt_count, -val); + raw_cpu_add_4(__preempt_count, -val); } /* @@ -91,7 +92,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e, + return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e, __percpu_arg([var])); } @@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void) */ static __always_inline bool should_resched(int preempt_offset) { - return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset); + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } #ifdef CONFIG_PREEMPTION diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90..5d2f7e5aff26 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -60,18 +60,13 @@ struct vm86; # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -enum tlb_infos { - ENTRIES, - NR_INFO -}; - -extern u16 __read_mostly tlb_lli_4k[NR_INFO]; -extern u16 __read_mostly tlb_lli_2m[NR_INFO]; -extern u16 __read_mostly tlb_lli_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4k[NR_INFO]; -extern u16 __read_mostly tlb_lld_2m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_1g[NR_INFO]; +extern u16 __read_mostly tlb_lli_4k; +extern u16 __read_mostly tlb_lli_2m; +extern u16 __read_mostly tlb_lli_4m; +extern u16 __read_mostly tlb_lld_4k; +extern u16 __read_mostly tlb_lld_2m; +extern u16 __read_mostly tlb_lld_4m; +extern u16 __read_mostly tlb_lld_1g; /* * CPU type and hardware bug flags. Kept separately for each CPU. @@ -234,7 +229,7 @@ static inline unsigned long long l1tf_pfn_limit(void) void init_cpu_devs(void); void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); @@ -420,37 +415,33 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); #ifdef CONFIG_X86_64 -struct fixed_percpu_data { - /* - * GCC hardcodes the stack canary as %gs:40. Since the - * irq_stack is the object at %gs:0, we reserve the bottom - * 48 bytes of the irq stack for the canary. - * - * Once we are willing to require -mstack-protector-guard-symbol= - * support for x86_64 stackprotector, we can get rid of this. - */ - char gs_base[40]; - unsigned long stack_canary; -}; +DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); +#else +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); +#endif -DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; -DECLARE_INIT_PER_CPU(fixed_percpu_data); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack); +/* const-qualified alias provided by the linker. 
*/ +DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override, + const_cpu_current_top_of_stack); +#ifdef CONFIG_X86_64 static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); +#ifdef CONFIG_SMP + return per_cpu_offset(cpu); +#else + return 0; +#endif } extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); -#else /* X86_64 */ -#ifdef CONFIG_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, __stack_chk_guard); -#endif -#endif /* !X86_64 */ +#endif /* X86_64 */ struct perf_event; @@ -561,9 +552,9 @@ static __always_inline unsigned long current_top_of_stack(void) * entry trampoline. */ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) - return this_cpu_read_const(const_pcpu_hot.top_of_stack); + return this_cpu_read_const(const_cpu_current_top_of_stack); - return this_cpu_read_stable(pcpu_hot.top_of_stack); + return this_cpu_read_stable(cpu_current_top_of_stack); } static __always_inline bool on_thread_stack(void) @@ -668,8 +659,6 @@ static __always_inline void prefetchw(const void *x) .sysenter_cs = __KERNEL_CS, \ } -#define KSTK_ESP(task) (task_pt_regs(task)->sp) - #else extern unsigned long __top_init_kernel_stack[]; @@ -677,8 +666,6 @@ extern unsigned long __top_init_kernel_stack[]; .sp = (unsigned long)&__top_init_kernel_stack, \ } -extern unsigned long KSTK_ESP(struct task_struct *task); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, @@ -692,6 +679,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) @@ -757,6 +745,7 @@ extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, + MDS_MITIGATION_AUTO, MDS_MITIGATION_FULL, MDS_MITIGATION_VMWERV, }; diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index 365798cb4408..5d0dbab85264 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -8,7 +8,7 @@ #ifndef _ASM_X86_PROM_H #define _ASM_X86_PROM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/of.h> #include <linux/types.h> @@ -33,5 +33,5 @@ static inline void x86_flattree_get_config(void) { } extern char cmd_line[COMMAND_LINE_SIZE]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 484f4f0131a5..05224a695872 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -15,7 +15,6 @@ void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); void entry_SYSRETQ_unsafe_stack(void); void entry_SYSRETQ_end(void); -long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -41,6 +40,6 @@ void x86_configure_nx(void); extern int reboot_force; -long do_arch_prctl_common(int option, unsigned long arg2); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index ab167c96b9ab..88d0a1ab1f77 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ 
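The preempt.h and processor.h hunks above replace members of the old pcpu_hot structure with standalone per-CPU variables placed in a cache-hot section via DECLARE_PER_CPU_CACHE_HOT. A minimal sketch of the resulting pattern, using a hypothetical variable name and assuming a DEFINE_PER_CPU_CACHE_HOT counterpart macro for the defining translation unit:

/* header: declaration visible to all users */
DECLARE_PER_CPU_CACHE_HOT(int, example_hot_counter);

/* one .c file: definition placed in the cache-hot per-CPU section
 * (DEFINE_PER_CPU_CACHE_HOT is assumed here, by analogy with the
 * DECLARE_ form used throughout the patch)
 */
DEFINE_PER_CPU_CACHE_HOT(int, example_hot_counter);

static __always_inline int example_hot_counter_read(void)
{
	/* a single segment-relative load, as with the old pcpu_hot members */
	return raw_cpu_read_4(example_hot_counter);
}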
#ifndef _ASM_X86_PTI_H #define _ASM_X86_PTI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); @@ -11,5 +11,5 @@ extern void pti_finalize(void); static inline void pti_check_boottime_disable(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 5a83fbd9bc0b..50f75467f73d 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,7 @@ #include <asm/page_types.h> #include <uapi/asm/ptrace.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __i386__ struct pt_regs { @@ -469,5 +469,5 @@ extern int do_set_thread_area(struct task_struct *p, int idx, # define do_set_thread_area_64(p, s, t) (0) #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h index 5528e9325049..2fee5e9f1ccc 100644 --- a/arch/x86/include/asm/purgatory.h +++ b/arch/x86/include/asm/purgatory.h @@ -2,10 +2,10 @@ #ifndef _ASM_X86_PURGATORY_H #define _ASM_X86_PURGATORY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/purgatory.h> extern void purgatory(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 1436226efe3e..b9fece5fc96d 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_ABI_H #define _ASM_X86_PVCLOCK_ABI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * These structs MUST NOT be changed. @@ -44,5 +44,5 @@ struct pvclock_wall_clock { #define PVCLOCK_GUEST_STOPPED (1 << 1) /* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 87e5482acd0d..f607081a022a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -9,7 +9,7 @@ #define TH_FLAGS_SME_ACTIVE_BIT 0 #define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/io.h> @@ -95,6 +95,6 @@ void reserve_real_mode(void); void load_trampoline_pgtable(void); void init_real_mode(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h deleted file mode 100644 index e9187ddd3d1f..000000000000 --- a/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. 
- - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 -#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK20 0 -#define REQUIRED_MASK21 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 8b1b6ce1e51b..011bf67a1866 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -4,8 +4,10 @@ #ifdef CONFIG_X86_CPU_RESCTRL -#include <linux/sched.h> #include <linux/jump_label.h> +#include <linux/percpu.h> +#include <linux/resctrl_types.h> +#include <linux/sched.h> /* * This value can never be a valid CLOSID, and is used when mapping a @@ -40,6 +42,7 @@ DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); extern bool rdt_alloc_capable; extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); @@ -79,6 +82,21 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } +static inline bool resctrl_arch_is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool resctrl_arch_is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool 
resctrl_arch_is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -96,8 +114,8 @@ static inline void resctrl_arch_disable_mon(void) static inline void __resctrl_sched_in(struct task_struct *tsk) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 closid = state->default_closid; - u32 rmid = state->default_rmid; + u32 closid = READ_ONCE(state->default_closid); + u32 rmid = READ_ONCE(state->default_rmid); u32 tmp; /* @@ -132,6 +150,13 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) return val * scale; } +static inline void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, + u32 rmid) +{ + WRITE_ONCE(per_cpu(pqr_state.default_closid, cpu), closid); + WRITE_ONCE(per_cpu(pqr_state.default_rmid, cpu), rmid); +} + static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) { @@ -178,6 +203,11 @@ static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, void *ctx) { }; +u64 resctrl_arch_get_prefetch_disable_bits(void); +int resctrl_arch_pseudo_lock_fn(void *_plr); +int resctrl_arch_measure_cycles_lat_fn(void *_plr); +int resctrl_arch_measure_l2_residency(void *_plr); +int resctrl_arch_measure_l3_residency(void *_plr); void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 363266cbcada..3821ee3fae35 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -29,7 +29,7 @@ cc_label: c = true; \ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ ({ \ bool c; \ - asm volatile (fullop CC_SET(cc) \ + asm_inline volatile (fullop CC_SET(cc) \ : [var] "+m" (_var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ c; \ diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index 6652ebddfd02..8d983cfd06ea 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -2,6 +2,18 @@ #ifndef _ASM_RUNTIME_CONST_H #define _ASM_RUNTIME_CONST_H +#ifdef __ASSEMBLY__ + +.macro RUNTIME_CONST_PTR sym reg + movq $0x0123456789abcdef, %\reg + 1: + .pushsection runtime_ptr_\sym, "a" + .long 1b - 8 - . 
+ .popsection +.endm + +#else /* __ASSEMBLY__ */ + #define runtime_const_ptr(sym) ({ \ typeof(sym) __ret; \ asm_inline("mov %1,%0\n1:\n" \ @@ -58,4 +70,5 @@ static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), } } +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 9d6411c65920..77d8f49b92bd 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -233,7 +233,7 @@ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Helper functions to store/load CPU and node numbers */ @@ -265,7 +265,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) *node = (p >> VDSO_CPUNODE_BITS); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef __KERNEL__ @@ -286,7 +286,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); @@ -350,7 +350,7 @@ static inline void __loadsegment_fs(unsigned short value) #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index cc62ef70ccc0..8d9f1c9aaa4c 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H -#include <linux/mm.h> #include <asm/page.h> #include <asm-generic/set_memory.h> @@ -38,7 +37,6 @@ int set_memory_rox(unsigned long addr, int numpages); * The caller is required to take care of these. 
*/ -int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot); int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); int _set_memory_wt(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 85f4fde3515c..ad9212df0ec0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -27,7 +27,7 @@ #define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */ #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cache.h> #include <asm/bootparam.h> @@ -46,6 +46,7 @@ void setup_bios_corruption_check(void); void early_platform_quirks(void); extern unsigned long saved_video_mode; +extern unsigned long acpi_realmode_flags; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); @@ -141,7 +142,7 @@ extern bool builtin_cmdline_added __ro_after_init; #define builtin_cmdline_added 0 #endif -#else /* __ASSEMBLY */ +#else /* __ASSEMBLER__ */ .macro __RESERVE_BRK name, size .pushsection .bss..brk, "aw" @@ -153,6 +154,6 @@ SYM_DATA_END(__brk_\name) #define RESERVE_BRK(name, size) __RESERVE_BRK name, size -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SETUP_H */ diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h index 77c51111a893..7bb16f843c93 100644 --- a/arch/x86/include/asm/setup_data.h +++ b/arch/x86/include/asm/setup_data.h @@ -4,7 +4,7 @@ #include <uapi/asm/setup_data.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct pci_setup_rom { struct setup_data data; @@ -27,6 +27,6 @@ struct efi_setup_data { u64 reserved[8]; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index fcbbef484a78..a28ff6b14145 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -106,7 +106,7 @@ #define TDX_PS_1G 2 #define TDX_PS_NR (TDX_PS_1G + 1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/compiler_attributes.h> @@ -177,5 +177,5 @@ static __always_inline u64 hcall_func(u64 exit_reason) return exit_reason; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h index 4cb77e004615..ba6f2fe43848 100644 --- a/arch/x86/include/asm/shstk.h +++ b/arch/x86/include/asm/shstk.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_SHSTK_H #define _ASM_X86_SHSTK_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> struct task_struct; @@ -37,6 +37,6 @@ static inline int shstk_update_last_frame(unsigned long val) { return 0; } static inline bool shstk_is_enabled(void) { return false; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SHSTK_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 4a4043ca6493..c72d46175374 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_SIGNAL_H #define _ASM_X86_SIGNAL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/linkage.h> /* Most things should be clean enough to redefine this at will, if care @@ -28,9 +28,9 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ 
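Most of the hunks above make a single mechanical change: the hand-maintained __ASSEMBLY__ guard becomes __ASSEMBLER__, the macro GCC and Clang predefine themselves whenever they preprocess an assembly source, so the guard no longer relies on the build flags defining it. A hypothetical header (not part of this patch) showing the converted pattern:

#ifndef _ASM_X86_EXAMPLE_H
#define _ASM_X86_EXAMPLE_H

#define EXAMPLE_FLAG	0x1			/* usable from both C and .S files */

#ifndef __ASSEMBLER__				/* C-only declarations follow */
#include <linux/types.h>
extern unsigned long example_read_flags(void);
#endif /* __ASSEMBLER__ */

#endif /* _ASM_X86_EXAMPLE_H */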
#include <uapi/asm/signal.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __ARCH_HAS_SA_RESTORER @@ -101,5 +101,5 @@ struct pt_regs; #endif /* !__i386__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 2de1e5a75c57..daea94c2993c 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,7 +13,7 @@ #include <asm/cpufeatures.h> #include <asm/alternative.h> -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define ASM_CLAC \ ALTERNATIVE "", "clac", X86_FEATURE_SMAP @@ -21,7 +21,7 @@ #define ASM_STAC \ ALTERNATIVE "", "stac", X86_FEATURE_SMAP -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ static __always_inline void clac(void) { @@ -61,6 +61,6 @@ static __always_inline void smap_restore(unsigned long flags) #define ASM_STAC \ ALTERNATIVE("", "stac", X86_FEATURE_SMAP) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ca073f40698f..0c1c68039d6f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -1,12 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SMP_H #define _ASM_X86_SMP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cpumask.h> +#include <linux/thread_info.h> #include <asm/cpumask.h> -#include <asm/current.h> -#include <asm/thread_info.h> + +DECLARE_PER_CPU_CACHE_HOT(int, cpu_number); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); @@ -114,13 +115,12 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); +void __noreturn mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void smp_store_cpu_info(int id); - asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); __visible void smp_call_function_interrupt(struct pt_regs *regs); @@ -133,14 +133,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * This function is needed by all SMP systems. It must _always_ be valid * from the initial startup. 
*/ -#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number) -#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number) - -#ifdef CONFIG_X86_32 -extern int safe_smp_processor_id(void); -#else -# define safe_smp_processor_id() smp_processor_id() -#endif +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) static inline struct cpumask *cpu_llc_shared_mask(int cpu) { @@ -164,6 +158,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return (struct cpumask *)cpumask_of(0); } + +static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST @@ -175,7 +171,7 @@ extern void nmi_selftest(void); extern unsigned int smpboot_control; extern unsigned long apic_mmio_base; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* Control bits for startup_64 */ #define STARTUP_READ_APICID 0x80000000 diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 03e7c2d49559..6266d6b9e0b8 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -42,14 +42,14 @@ static __always_inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } -static inline unsigned long __native_read_cr3(void) +static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } -static inline void native_write_cr3(unsigned long val) +static __always_inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } @@ -176,9 +176,8 @@ static __always_inline void clflush(volatile void *__p) static inline void clflushopt(volatile void *__p) { - alternative_io(".byte 0x3e; clflush %0", - ".byte 0x66; clflush %0", - X86_FEATURE_CLFLUSHOPT, + alternative_io("ds clflush %0", + "clflushopt %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } @@ -186,14 +185,11 @@ static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; - asm volatile(ALTERNATIVE_2( - ".byte 0x3e; clflush (%[pax])", - ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ - X86_FEATURE_CLFLUSHOPT, - ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ - X86_FEATURE_CLWB) - : [p] "+m" (*p) - : [pax] "a" (p)); + asm_inline volatile(ALTERNATIVE_2( + "ds clflush %0", + "clflushopt %0", X86_FEATURE_CLFLUSHOPT, + "clwb %0", X86_FEATURE_CLWB) + : "+m" (*p)); } #ifdef CONFIG_X86_USER_SHADOW_STACK diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h deleted file mode 100644 index e0975e9c4f47..000000000000 --- a/arch/x86/include/asm/sta2x11.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for STMicroelectronics ConneXt (STA2X11) IOHub - */ -#ifndef __ASM_STA2X11_H -#define __ASM_STA2X11_H - -#include <linux/pci.h> - -/* This needs to be called from the MFD to configure its sub-devices */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); - -#endif /* __ASM_STA2X11_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 00473a650f51..cd761b14eb02 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -2,26 +2,10 @@ /* * GCC stack protector support. 
* - * Stack protector works by putting predefined pattern at the start of + * Stack protector works by putting a predefined pattern at the start of * the stack frame and verifying that it hasn't been overwritten when - * returning from the function. The pattern is called stack canary - * and unfortunately gcc historically required it to be at a fixed offset - * from the percpu segment base. On x86_64, the offset is 40 bytes. - * - * The same segment is shared by percpu area and stack canary. On - * x86_64, percpu symbols are zero based and %gs (64-bit) points to the - * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the appropriate - * offset. On x86_32, the stack canary is just a regular percpu - * variable. - * - * Putting percpu data in %fs on 32-bit is a minor optimization compared to - * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely - * to load 0 into %fs on exit to usermode, whereas with percpu data in - * %gs, we are likely to load a non-null %gs on return to user mode. - * - * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector - * support, we can remove some of this complexity. + * returning from the function. The pattern is called the stack canary + * and is a unique value for each task. */ #ifndef _ASM_STACKPROTECTOR_H @@ -36,6 +20,8 @@ #include <linux/sched.h> +DECLARE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); + /* * Initialize the stackprotector canary value. * @@ -51,25 +37,13 @@ static __always_inline void boot_init_stack_canary(void) { unsigned long canary = get_random_canary(); -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); -#endif - current->stack_canary = canary; -#ifdef CONFIG_X86_64 - this_cpu_write(fixed_percpu_data.stack_canary, canary); -#else this_cpu_write(__stack_chk_guard, canary); -#endif } static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) { -#ifdef CONFIG_X86_64 - per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; -#else per_cpu(__stack_chk_guard, cpu) = idle->stack_canary; -#endif } #else /* STACKPROTECTOR */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 9d0b324eab21..79e9695dc13e 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -21,6 +21,7 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); +KCFI_REFERENCE(__memset); /* * KMSAN needs to instrument as much code as possible. 
Use C versions of @@ -70,6 +71,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); void *__memmove(void *dest, const void *src, size_t count); +KCFI_REFERENCE(__memmove); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 6d8d6bc183b7..cd21a0405ac5 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -31,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" + asm volatile("lock " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -49,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" + asm volatile("lock " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -66,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" + asm volatile("lock " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -82,7 +82,7 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -95,7 +95,7 @@ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -108,7 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index b4b16dafd55e..65394aa9b49f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -30,7 +30,7 @@ #define TDX_SUCCESS 0ULL #define TDX_RND_NO_ENTROPY 0x8000020300000000ULL -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <uapi/asm/mce.h> @@ -126,5 +126,5 @@ static inline int tdx_enable(void) { return -ENODEV; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a55c214f3ba6..9282465eea21 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -54,7 +54,7 @@ * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; #include <asm/cpufeature.h> #include <linux/atomic.h> @@ -73,7 +73,7 @@ struct 
thread_info { .flags = 0, \ } -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include <asm/asm-offsets.h> @@ -161,7 +161,7 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Walks up the stack frames to make sure that the specified object is @@ -213,7 +213,7 @@ static inline int arch_within_stack_frames(const void * const stack, #endif } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Thread-synchronous status. @@ -224,7 +224,7 @@ static inline int arch_within_stack_frames(const void * const stack, */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ @@ -242,6 +242,6 @@ static inline int arch_within_stack_frames(const void * const stack, extern void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 77f52bc1578a..866ea78ba156 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,9 @@ static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> +#include <linux/kernel.h> +#include <vdso/bits.h> +#include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { @@ -25,4 +28,139 @@ static inline void invlpg(unsigned long addr) asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } +enum addr_stride { + PTE_STRIDE = 0, + PMD_STRIDE = 1 +}; + +/* + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination + * of the three. For example: + * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address + * - FLAG_PCID: invalidate all TLB entries matching the PCID + * + * The first is used to invalidate (kernel) mappings at a particular + * address across all processes. + * + * The latter invalidates all TLB entries matching a PCID. + */ +#define INVLPGB_FLAG_VA BIT(0) +#define INVLPGB_FLAG_PCID BIT(1) +#define INVLPGB_FLAG_ASID BIT(2) +#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) +#define INVLPGB_FLAG_FINAL_ONLY BIT(4) +#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) + +/* The implied mode when all bits are clear: */ +#define INVLPGB_MODE_ALL_NONGLOBALS 0UL + +#ifdef CONFIG_BROADCAST_TLB_FLUSH +/* + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. + * + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can + * be done in a parallel fashion. + * + * The instruction takes the number of extra pages to invalidate, beyond the + * first page, while __invlpgb gets the more human readable number of pages to + * invalidate. + * + * The bits in rax[0:2] determine respectively which components of the address + * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* + * address in the specified range matches. + * + * Since it is desired to only flush TLB entries for the ASID that is executing + * the instruction (a host/hypervisor or a guest), the ASID valid bit should + * always be set. On a host/hypervisor, the hardware will use the ASID value + * specified in EDX[15:0] (which should be 0). On a guest, the hardware will + * use the actual ASID value of the guest. + * + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from + * this CPU have completed. 
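To make the operand packing described above concrete, here is a minimal standalone sketch (illustrative only, not part of this patch; the flag constants are copied from this hunk, the address, page count and PCID are made up) of how the flag bits end up in rAX/ECX/EDX, mirroring the __invlpgb() helper added just below.

/* Illustrative only -- models the operand packing of __invlpgb(). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGE_MASK               (~((1ULL << PAGE_SHIFT) - 1))
#define INVLPGB_FLAG_VA         (1ULL << 0)
#define INVLPGB_FLAG_PCID       (1ULL << 1)
#define INVLPGB_FLAG_ASID       (1ULL << 2)

int main(void)
{
        uint64_t addr     = 0x7f1234567000ULL;  /* page-aligned VA (hypothetical) */
        uint16_t nr_pages = 8;                  /* flush 8 consecutive pages      */
        uint32_t pcid     = 5, asid = 0;        /* host/hypervisor: ASID field 0  */

        /* Same packing as __invlpgb(): the flag bits live in the low bits of rAX. */
        uint64_t rax = (addr & PAGE_MASK) | INVLPGB_FLAG_VA | INVLPGB_FLAG_PCID
                       | INVLPGB_FLAG_ASID;
        uint32_t ecx = (0u << 31) | (nr_pages - 1);   /* PTE stride, N-1 extra pages */
        uint32_t edx = (pcid << 16) | asid;

        printf("rax=%#llx ecx=%#x edx=%#x\n",
               (unsigned long long)rax, ecx, edx);
        return 0;
}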
+ */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride stride, u8 flags) +{ + u64 rax = addr | flags | INVLPGB_FLAG_ASID; + u32 ecx = (stride << 31) | (nr_pages - 1); + u32 edx = (pcid << 16) | asid; + + /* The low bits in rax are for flags. Verify addr is clean. */ + VM_WARN_ON_ONCE(addr & ~PAGE_MASK); + + /* INVLPGB; supported in binutils >= 2.36. */ + asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); +} + +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) +{ + __invlpgb(asid, pcid, 0, 1, 0, flags); +} + +static inline void __tlbsync(void) +{ + /* + * TLBSYNC waits for INVLPGB instructions originating on the same CPU + * to have completed. Print a warning if the task has been migrated, + * and might not be waiting on all the INVLPGBs issued during this TLB + * invalidation sequence. + */ + cant_migrate(); + + /* TLBSYNC: supported in binutils >= 0.36. */ + asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); +} +#else +/* Some compilers (I'm looking at you clang!) simply can't do DCE */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride s, u8 flags) { } +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } +static inline void __tlbsync(void) { } +#endif + +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + u16 nr, bool stride) +{ + enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; + u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; + + __invlpgb(0, pcid, addr, nr, str, flags); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) +{ + __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invlpgb_flush_all(void) +{ + /* + * TLBSYNC at the end needs to make sure all flushes done on the + * current CPU have been executed system-wide. Therefore, make + * sure nothing gets migrated in-between but disable preemption + * as it is cheaper. + */ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); + __tlbsync(); +} + +/* Flush addr, including globals, for all PCIDs. */ +static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) +{ + __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invlpgb_flush_all_nonglobals(void) +{ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); + __tlbsync(); +} #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h index 1ad56eb3e8a8..80aaf64ff25f 100644 --- a/arch/x86/include/asm/tlbbatch.h +++ b/arch/x86/include/asm/tlbbatch.h @@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch { * the PFNs being flushed.. */ struct cpumask cpumask; + /* + * Set if pages were unmapped from any MM, even one that does not + * have active CPUs in its cpumask. 
+ */ + bool unmapped_pages; }; #endif /* _ARCH_X86_TLBBATCH_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 02fc2aa06e9e..a9af8759de34 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -6,6 +6,7 @@ #include <linux/mmu_notifier.h> #include <linux/sched.h> +#include <asm/barrier.h> #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> @@ -183,6 +184,9 @@ static inline void cr4_init_shadow(void) extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; +/* How many pages can be invalidated with one INVLPGB. */ +extern u16 invlpgb_count_max; + extern void initialize_tlbstate_and_flush(void); /* @@ -231,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline bool is_dyn_asid(u16 asid) +{ + return asid < TLB_NR_DYN_ASIDS; +} + +static inline bool is_global_asid(u16 asid) +{ + return !is_dyn_asid(asid); +} + +#ifdef CONFIG_BROADCAST_TLB_FLUSH +static inline u16 mm_global_asid(struct mm_struct *mm) +{ + u16 asid; + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return 0; + + asid = smp_load_acquire(&mm->context.global_asid); + + /* mm->context.global_asid is either 0, or a global ASID */ + VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); + + return asid; +} + +static inline void mm_init_global_asid(struct mm_struct *mm) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + mm->context.global_asid = 0; + mm->context.asid_transition = false; + } +} + +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) +{ + /* + * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> + * finish_asid_transition() needs to observe asid_transition = true + * once it observes global_asid. + */ + mm->context.asid_transition = true; + smp_store_release(&mm->context.global_asid, asid); +} + +static inline void mm_clear_asid_transition(struct mm_struct *mm) +{ + WRITE_ONCE(mm->context.asid_transition, false); +} + +static inline bool mm_in_asid_transition(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + return mm && READ_ONCE(mm->context.asid_transition); +} +#else +static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } +static inline void mm_init_global_asid(struct mm_struct *mm) { } +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } +static inline void mm_clear_asid_transition(struct mm_struct *mm) { } +static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } +#endif /* CONFIG_BROADCAST_TLB_FLUSH */ + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -242,7 +311,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? 
huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT, false) + : PAGE_SHIFT, true) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, @@ -284,6 +353,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + batch->unmapped_pages = true; mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ec134b719144..6c79ee7c0957 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -229,11 +229,11 @@ static inline bool topology_is_primary_thread(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_primary_thread_mask); } +#define topology_is_primary_thread topology_is_primary_thread #else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } -static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } #endif /* !CONFIG_SMP */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 1f1deaecd364..869b88061801 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -35,8 +35,6 @@ static inline int get_si_code(unsigned long condition) return TRAP_BRKPT; } -extern int panic_on_unrecovered_nmi; - void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 85cc57cb6539..8f4579c5a6f8 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -5,7 +5,7 @@ #include "orc_types.h" -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro UNWIND_HINT_END_OF_STACK UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK @@ -88,6 +88,6 @@ #define UNWIND_HINT_RESTORE \ UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d7f6592b74a9..80be0da733df 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -18,12 +18,6 @@ struct vdso_image { unsigned long extable_base, extable_len; const void *extable; - long sym_vvar_start; /* Negative offset to the vvar area */ - - long sym_vvar_page; - long sym_pvclock_page; - long sym_hvclock_page; - long sym_timens_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h index 2bf9c0e970c3..ff1c11b9fa27 100644 --- a/arch/x86/include/asm/vdso/getrandom.h +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_GETRANDOM_H #define __ASM_VDSO_GETRANDOM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/unistd.h> @@ -27,16 +27,6 @@ static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsig return ret; } -extern struct vdso_rng_data vdso_rng_data - __attribute__((visibility("hidden"))); - -static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) -{ - if (IS_ENABLED(CONFIG_TIME_NS) && __arch_get_vdso_data()->clock_mode == VDSO_CLOCKMODE_TIMENS) - return (void *)&vdso_rng_data + ((void *)&timens_page - (void *)__arch_get_vdso_data()); - 
return &vdso_rng_data; -} - -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 375a34b0f365..73b2e7ee8f0f 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -10,7 +10,7 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <uapi/linux/time.h> #include <asm/vgtod.h> @@ -19,12 +19,6 @@ #include <asm/pvclock.h> #include <clocksource/hyperv_timer.h> -extern struct vdso_data vvar_page - __attribute__((visibility("hidden"))); - -extern struct vdso_data timens_page - __attribute__((visibility("hidden"))); - #define VDSO_HAS_TIME 1 #define VDSO_HAS_CLOCK_GETRES 1 @@ -59,14 +53,6 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return &timens_page; -} -#endif - #ifndef BUILD_VDSO32 static __always_inline @@ -250,7 +236,7 @@ static u64 vread_hvclock(void) #endif static inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data *vd) { if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) return (u64)rdtsc_ordered() & S64_MAX; @@ -275,12 +261,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode, return U64_MAX; } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return &vvar_page; -} - -static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool arch_vdso_clocksource_ok(const struct vdso_clock *vc) { return true; } @@ -319,37 +300,37 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * declares everything with the MSB/Sign-bit set as invalid. Therefore the * effective mask is S64_MAX. */ -static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) { - u64 delta = cycles - vd->cycle_last; + u64 delta = cycles - vc->cycle_last; /* * Negative motion and deltas which can cause multiplication * overflow require special treatment. This check covers both as - * negative motion is guaranteed to be greater than @vd::max_cycles + * negative motion is guaranteed to be greater than @vc::max_cycles * due to unsigned comparison. * * Due to the MSB/Sign-bit being used as invalid marker (see * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that * case is also unlikely and will also take the unlikely path here. */ - if (unlikely(delta > vd->max_cycles)) { + if (unlikely(delta > vc->max_cycles)) { /* * Due to the above mentioned TSC wobbles, filter out * negative motion. Per the above masking, the effective * sign bit is now bit 62. 
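As a hedged, standalone illustration of the comment above (not kernel code): even a small amount of negative motion produces a wrapped-around unsigned delta, so the single unsigned "delta > max_cycles" comparison catches it, and after masking with S64_MAX the effective sign bit is bit 62. The max_cycles value below is arbitrary.

/* Standalone illustration: a reading 3 cycles behind cycle_last wraps around. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t cycle_last = 1000000;
        uint64_t cycles     = cycle_last - 3;       /* small negative motion      */
        uint64_t max_cycles = 1ULL << 40;           /* arbitrary illustrative cap */
        uint64_t delta      = cycles - cycle_last;  /* wraps to 2^64 - 3          */

        printf("delta = %#llx, > max_cycles: %d, bit 62 set: %d\n",
               (unsigned long long)delta,
               delta > max_cycles,
               (int)((delta >> 62) & 1));
        return 0;
}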
*/ if (delta & (1ULL << 62)) - return base >> vd->shift; + return base >> vc->shift; /* Handle multiplication overflow gracefully */ - return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift); + return mul_u64_u32_add_u64_shr(delta & S64_MAX, vc->mult, base, vc->shift); } - return ((delta * vd->mult) + base) >> vd->shift; + return ((delta * vc->mult) + base) >> vc->shift; } #define vdso_calc_ns vdso_calc_ns -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index 2cbce97d29ea..c9b2ba7a9ec4 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -5,7 +5,7 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ static __always_inline void rep_nop(void) @@ -22,6 +22,6 @@ struct getcpu_cache; notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index 37b4a70559a8..4aa311a923f2 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -2,40 +2,21 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#define __VDSO_RND_DATA_OFFSET 640 -#define __VVAR_PAGES 4 +#define __VDSO_PAGES 6 #define VDSO_NR_VCLOCK_PAGES 2 +#define VDSO_VCLOCK_PAGES_START(_b) ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE) #define VDSO_PAGE_PVCLOCK_OFFSET 0 #define VDSO_PAGE_HVCLOCK_OFFSET 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <vdso/datapage.h> #include <asm/vgtod.h> -extern struct vdso_data *vdso_data; - -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. 
- */ -static __always_inline -struct vdso_data *__x86_get_k_vdso_data(void) -{ - return vdso_data; -} -#define __arch_get_k_vdso_data __x86_get_k_vdso_data - -static __always_inline -struct vdso_rng_data *__x86_get_k_vdso_rng_data(void) -{ - return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; -} -#define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data - /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..5d471253c755 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -15,8 +15,6 @@ #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,8 +31,6 @@ #define MODULE_PROC_FAMILY "K6 " #elif defined CONFIG_MK7 #define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index baca0b00ef76..a078a2b0f032 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -72,7 +72,7 @@ #endif #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Explicitly size integers that represent pfns in the public interface * with Xen so that on ARM we can have one ABI that works for 32 and 64 * bit guests. */ @@ -137,7 +137,7 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); #define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct trap_info { uint8_t vector; /* exception vector */ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ @@ -186,7 +186,7 @@ struct arch_shared_info { uint32_t wc_sec_hi; #endif }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_X86_32 #include <asm/xen/interface_32.h> @@ -196,7 +196,7 @@ struct arch_shared_info { #include <asm/pvclock-abi.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. @@ -376,7 +376,7 @@ struct xen_pmu_arch { } c; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Prefix forces emulation of some non-trapping instructions. 
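A quick, self-contained check of the page-layout macro added to vdso/vsyscall.h above (the base address below is hypothetical; only the offset arithmetic is of interest): with __VDSO_PAGES = 6 and VDSO_NR_VCLOCK_PAGES = 2, the vclock pages begin four pages past the base.

/* Layout check only -- macros copied from the vdso/vsyscall.h hunk above. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE                       4096UL
#define __VDSO_PAGES                    6
#define VDSO_NR_VCLOCK_PAGES            2
#define VDSO_VCLOCK_PAGES_START(_b)     ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE)

int main(void)
{
        uint64_t base = 0x7ffff7fc0000ULL;          /* hypothetical vdso data base */

        /* Six pages total, the last two reserved for the pvclock/hvclock pages. */
        printf("vclock pages start at base + %lu bytes (%#llx)\n",
               (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE,
               (unsigned long long)VDSO_VCLOCK_PAGES_START(base));
        return 0;
}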
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h index dc40578abded..74d9768a9cf7 100644 --- a/arch/x86/include/asm/xen/interface_32.h +++ b/arch/x86/include/asm/xen/interface_32.h @@ -44,7 +44,7 @@ */ #define __HYPERVISOR_VIRT_START 0xF5800000 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct cpu_user_regs { uint32_t ebx; @@ -85,7 +85,7 @@ typedef struct xen_callback xen_callback_t; #define XEN_CALLBACK(__cs, __eip) \ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) }) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index c10f279aae93..38a19edb81a3 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h @@ -77,7 +77,7 @@ #define VGCF_in_syscall (1<<_VGCF_in_syscall) #define VGCF_IN_SYSCALL VGCF_in_syscall -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct iret_context { /* Top of stack (%rsp at point of hypercall). */ @@ -143,7 +143,7 @@ typedef unsigned long xen_callback_t; #define XEN_CALLBACK(__cs, __rip) \ ((unsigned long)(__rip)) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_XEN_INTERFACE_64_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 9b82eebd7add..dafbf581c515 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -26,7 +26,7 @@ #define XLF_5LEVEL_ENABLED (1<<6) #define XLF_MEM_ENCRYPTION (1<<7) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/screen_info.h> @@ -210,6 +210,6 @@ enum x86_hardware_subarch { X86_NR_SUBARCHS, }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 2f491efe3a12..55bc66867156 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -54,7 +54,7 @@ */ #define E820_RESERVED_KERN 128 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> struct e820entry { __u64 addr; /* start of memory segment */ @@ -76,7 +76,7 @@ struct e820map { #define BIOS_ROM_BASE 0xffe00000 #define BIOS_ROM_END 0xffffffff -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_E820_H */ diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index d62ac5db093b..a82c039d8e6a 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -12,7 +12,7 @@ /* The size of each LDT entry. */ #define LDT_ENTRY_SIZE 8 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS * not to the default values if you still want to do syscalls. 
This @@ -44,5 +44,5 @@ struct user_desc { #define MODIFY_LDT_CONTENTS_STACK 1 #define MODIFY_LDT_CONTENTS_CODE 2 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_LDT_H */ diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index e7516b402a00..4b8917ca28fe 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -2,7 +2,7 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/ioctl.h> @@ -10,5 +10,5 @@ #define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) #define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_MSR_H */ diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 16074b9c93bb..5823584dea13 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,7 +25,7 @@ #else /* __i386__ */ -#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS) /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". @@ -57,7 +57,7 @@ #define EFLAGS 144 #define RSP 152 #define SS 160 -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* top of stack page */ #define FRAME_SIZE 168 @@ -87,7 +87,7 @@ #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #endif diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index 85165c0edafc..e0b5b4f6226b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -7,7 +7,7 @@ #include <asm/processor-flags.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __i386__ /* this struct defines the way the registers are stored on the @@ -81,6 +81,6 @@ struct pt_regs { -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h index b111b0c18544..50c45ead4e7c 100644 --- a/arch/x86/include/uapi/asm/setup_data.h +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -18,7 +18,7 @@ #define SETUP_INDIRECT (1<<31) #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -78,6 +78,6 @@ struct ima_setup_data { __u64 size; } __attribute__((packed)); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index f777346450ec..1067efabf18b 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -2,7 +2,7 @@ #ifndef _UAPI_ASM_X86_SIGNAL_H #define _UAPI_ASM_X86_SIGNAL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/compiler.h> @@ -16,7 +16,7 @@ struct siginfo; typedef unsigned long sigset_t; #endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define SIGHUP 1 @@ -68,7 +68,7 @@ typedef unsigned long sigset_t; #include <asm-generic/signal-defs.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # ifndef __KERNEL__ @@ -106,6 +106,6 @@ typedef struct sigaltstack { __kernel_size_t ss_size; } stack_t; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ 
#endif /* _UAPI_ASM_X86_SIGNAL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b43eb7e384eb..84cfa179802c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,8 @@ KCOV_INSTRUMENT_unwind_orc.o := n KCOV_INSTRUMENT_unwind_frame.o := n KCOV_INSTRUMENT_unwind_guess.o := n +CFLAGS_head32.o := -fno-stack-protector +CFLAGS_head64.o := -fno-stack-protector CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5854f0b8f0f1..d5ac34186555 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,9 +13,11 @@ #include <linux/sched.h> #include <acpi/processor.h> +#include <asm/cpu_device_id.h> #include <asm/cpuid.h> #include <asm/mwait.h> #include <asm/special_insns.h> +#include <asm/smp.h> /* * Initialize bm_flags based on the CPU cache properties @@ -47,12 +49,11 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, /* * On all recent Intel platforms, ARB_DISABLE is a nop. * So, set bm_control to zero to indicate that ARB_DISABLE - * is not required while entering C3 type state on - * P4, Core and beyond CPUs + * is not required while entering C3 type state. */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) - flags->bm_control = 0; + (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST))) + flags->bm_control = 0; if (c->x86_vendor == X86_VENDOR_CENTAUR) { if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && @@ -205,6 +206,16 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_play_dead(percpu_entry->states[cx->index].eax); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead); + void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S index 4e498d28cdc8..aefb9cb583ad 100644 --- a/arch/x86/kernel/acpi/madt_playdead.S +++ b/arch/x86/kernel/acpi/madt_playdead.S @@ -14,6 +14,7 @@ * rsi: PGD of the identity mapping */ SYM_FUNC_START(asm_acpi_mp_play_dead) + ANNOTATE_NOENDBR /* Turn off global entries. Following CR3 write will flush them. */ movq %cr4, %rdx andq $~(X86_CR4_PGE), %rdx diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c index d5ef6215583b..f36f28405dcc 100644 --- a/arch/x86/kernel/acpi/madt_wakeup.c +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -70,58 +70,6 @@ static void __init free_pgt_page(void *pgt, void *dummy) return memblock_free(pgt, PAGE_SIZE); } -/* - * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at - * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches - * to the identity mapping and the function has be present at the same spot in - * the virtual address space before and after switching page tables. 
- */ -static int __init init_transition_pgtable(pgd_t *pgd) -{ - pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; - unsigned long vaddr, paddr; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - vaddr = (unsigned long)asm_acpi_mp_play_dead; - pgd += pgd_index(vaddr); - if (!pgd_present(*pgd)) { - p4d = (p4d_t *)alloc_pgt_page(NULL); - if (!p4d) - return -ENOMEM; - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); - } - p4d = p4d_offset(pgd, vaddr); - if (!p4d_present(*p4d)) { - pud = (pud_t *)alloc_pgt_page(NULL); - if (!pud) - return -ENOMEM; - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); - } - pud = pud_offset(p4d, vaddr); - if (!pud_present(*pud)) { - pmd = (pmd_t *)alloc_pgt_page(NULL); - if (!pmd) - return -ENOMEM; - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - pmd = pmd_offset(pud, vaddr); - if (!pmd_present(*pmd)) { - pte = (pte_t *)alloc_pgt_page(NULL); - if (!pte) - return -ENOMEM; - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); - } - pte = pte_offset_kernel(pmd, vaddr); - - paddr = __pa(vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); - - return 0; -} - static int __init acpi_mp_setup_reset(u64 reset_vector) { struct x86_mapping_info info = { @@ -130,6 +78,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) .page_flag = __PAGE_KERNEL_LARGE_EXEC, .kernpg_flag = _KERNPG_TABLE_NOENC, }; + unsigned long mstart, mend; pgd_t *pgd; pgd = alloc_pgt_page(NULL); @@ -137,8 +86,6 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) return -ENOMEM; for (int i = 0; i < nr_pfn_mapped; i++) { - unsigned long mstart, mend; - mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { @@ -147,14 +94,24 @@ static int __init acpi_mp_setup_reset(u64 reset_vector) } } - if (kernel_ident_mapping_init(&info, pgd, - PAGE_ALIGN_DOWN(reset_vector), - PAGE_ALIGN(reset_vector + 1))) { + mstart = PAGE_ALIGN_DOWN(reset_vector); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { kernel_ident_mapping_free(&info, pgd); return -ENOMEM; } - if (init_transition_pgtable(pgd)) { + /* + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping + * at the same place as in the kernel page tables. + * asm_acpi_mp_play_dead() switches to the identity mapping and the + * function must be present at the same spot in the virtual address space + * before and after switching page tables. + */ + info.offset = __START_KERNEL_map - phys_base; + mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead)); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { kernel_ident_mapping_free(&info, pgd); return -ENOMEM; } diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index b200a193beeb..04f561f75e99 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,6 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) + ANNOTATE_NOENDBR movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c71b575bf229..bf82c6f7d690 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". 
*/ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, - struct module *mod) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { - u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, } if (a->instrlen != 6 || - wr_instr[0] != CALL_RIP_REL_OPCODE || - wr_instr[1] != CALL_RIP_REL_MODRM) { + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(wr_instr + 2); + disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end, - struct module *mod) + struct alt_instr *end) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); - wr_instr = module_writable_address(mod, instr); - replacement = (u8 *)&a->repl_offset + a->repl_offset; - wr_replacement = module_writable_address(mod, replacement); - BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, wr_instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(wr_instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, wr_replacement, a->replacementlen); + memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a, - mod); + insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } @@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(wr_instr, insn_buff, insn_buff_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. 
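A standalone sketch (not the kernel's code; the address and displacement below are fabricated) of the "next instruction + disp" rule used by alt_replace_call() above when decoding a 6-byte "ff 15 disp32" call *disp(%rip): the pointer slot holding the real target sits at instr + 6 + disp.

/* Models the displacement math only; the kernel reads disp as *(s32 *)(instr + 2). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint8_t  insn[6] = { 0xff, 0x15, 0x10, 0x00, 0x00, 0x00 }; /* call *0x10(%rip) */
        uint64_t instr   = 0xffffffff81000000ULL;   /* hypothetical patch site */
        int32_t  disp;

        memcpy(&disp, &insn[2], sizeof(disp));
        /* %rip points at the next instruction, so the pointer slot lives at: */
        uint64_t slot = instr + sizeof(insn) + (int64_t)disp;

        printf("indirect-call slot at %#llx\n", (unsigned long long)slot);
        return 0;
}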
*/ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -752,6 +741,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, op2 = insn.opcode.bytes[1]; switch (op1) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + /* See cfi_paranoid. */ + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: break; @@ -772,9 +766,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } @@ -810,8 +804,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; @@ -820,13 +813,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -846,41 +838,59 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } -#else -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } -#endif /* CONFIG_MITIGATION_RETHUNK */ +#else /* !CONFIG_MITIGATION_RETHUNK: */ +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr); +__noendbr bool is_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); + +Efault: + return false; +} -static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) +#ifdef CONFIG_FINEIBT + +static __noendbr bool exact_endbr(u32 *val) { - u32 endbr, poison = 
gen_endbr_poison(); + u32 endbr; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) - return; + __get_kernel_nofault(&endbr, val, u32, Efault); + return endbr == gen_endbr(); + +Efault: + return false; +} - if (!is_endbr(endbr)) { - WARN_ON_ONCE(warn); +#endif + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr) +{ + u32 poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(!is_endbr(addr))) return; - } DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); @@ -889,7 +899,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(wr_addr, &poison, 4); + text_poke_early(addr, &poison, 4); } /* @@ -898,36 +908,39 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, wr_addr, true); + poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16, wr_addr - 16); + poison_cfi(addr - 16); } } -#else +#else /* !CONFIG_X86_KERNEL_IBT: */ -void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif /* !CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_CFI_AUTO_DEFAULT -#define __CFI_DEFAULT CFI_AUTO +# define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) -#define __CFI_DEFAULT CFI_KCFI +# define __CFI_DEFAULT CFI_KCFI #else -#define __CFI_DEFAULT CFI_OFF +# define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; +#ifdef CONFIG_FINEIBT_BHI +bool cfi_bhi __ro_after_init = false; +#endif + #ifdef CONFIG_CFI_CLANG struct bpf_insn; @@ -935,11 +948,7 @@ struct bpf_insn; extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); -/* - * Force a reference to the external symbol so the compiler generates - * __kcfi_typid. - */ -__ADDRESSABLE(__bpf_prog_runX); +KCFI_REFERENCE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( @@ -956,7 +965,7 @@ asm ( /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); -__ADDRESSABLE(__bpf_callback_fn); +KCFI_REFERENCE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( @@ -991,6 +1000,21 @@ u32 cfi_get_func_hash(void *func) return hash; } + +int cfi_get_func_arity(void *func) +{ + bhi_thunk *target; + s32 disp; + + if (cfi_mode != CFI_FINEIBT && !cfi_bhi) + return 0; + + if (get_kernel_nofault(disp, func - 4)) + return 0; + + target = func + disp; + return target - __bhi_args; +} #endif #ifdef CONFIG_FINEIBT @@ -998,6 +1022,8 @@ u32 cfi_get_func_hash(void *func) static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; +static bool cfi_paranoid __ro_after_init = false; + /* * Re-hash the CFI hash with a boot-time seed while making sure the result is * not a valid ENDBR instruction. 
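A standalone model (not the kernel's definitions) of the pointer arithmetic in cfi_get_func_arity() above: the rel32 read from func - 4 points into an array of fixed-size argument thunks, and the array index is the function's arity. The 32-byte thunk size below is purely an assumption for illustration.

/* Illustrative only -- recovering an array index by pointer subtraction. */
#include <stdint.h>
#include <stdio.h>

typedef struct { uint8_t bytes[32]; } bhi_thunk;    /* assumed size, for illustration */

static bhi_thunk bhi_args[8];                        /* stand-in for __bhi_args[] */

int main(void)
{
        /* Pretend the decoded rel32 landed us on the fourth thunk (index 3). */
        bhi_thunk *target = &bhi_args[3];

        int arity = (int)(target - bhi_args);        /* same trick as the patch */
        printf("recovered arity = %d\n", arity);
        return 0;
}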
@@ -1005,7 +1031,7 @@ static u32 cfi_seed __ro_after_init; static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; - while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) @@ -1037,6 +1063,25 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; + } else if (!strcmp(str, "warn")) { + pr_alert("CFI mismatch non-fatal!\n"); + cfi_warn = true; + } else if (!strcmp(str, "paranoid")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_paranoid = true; + } else { + pr_err("Ignoring paranoid; depends on fineibt.\n"); + } + } else if (!strcmp(str, "bhi")) { +#ifdef CONFIG_FINEIBT_BHI + if (cfi_mode == CFI_FINEIBT) { + cfi_bhi = true; + } else { + pr_err("Ignoring bhi; depends on fineibt.\n"); + } +#else + pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); +#endif } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -1054,9 +1099,9 @@ early_param("cfi", cfi_parse_cmdline); * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 - * nop jz 1f // 2 - * nop ud2 // 2 - * nop 1: nop // 1 + * nop jne __cfi_\func+6 // 2 + * nop nop3 // 3 + * nop * nop * nop * nop @@ -1068,34 +1113,53 @@ early_param("cfi", cfi_parse_cmdline); * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 - * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 - * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * */ -asm( ".pushsection .rodata \n" - "fineibt_preamble_start: \n" - " endbr64 \n" - " subl $0x12345678, %r10d \n" - " je fineibt_preamble_end \n" - " ud2 \n" - " nop \n" - "fineibt_preamble_end: \n" +/* + * <fineibt_preamble_start>: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d + * b: 75 f9 jne 6 <fineibt_preamble_start+0x6> + * d: 0f 1f 00 nopl (%rax) + * + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as + * (bad) on x86_64 and raises #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + "fineibt_preamble_bhi: \n" + " jne fineibt_preamble_start+6 \n" + ASM_NOP3 + "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) +#define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 +/* + * <fineibt_caller_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * a: 0f 1f 40 00 nopl 0x0(%rax) + */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" - " sub $16, %r11 \n" + " lea -0x10(%r11), %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" @@ -1109,13 +1173,62 @@ extern u8 fineibt_caller_end[]; #define fineibt_caller_jmp (fineibt_caller_size - 2) -static u32 decode_preamble_hash(void *addr) +/* + * Since FineIBT does hash validation on the callee side it is prone to + * circumvention attacks where a 'naked' ENDBR instruction exists that + * is not part of the fineibt_preamble sequence. 
+ * + * Notably the x86 entry points must be ENDBR and equally cannot be + * fineibt_preamble. + * + * The fineibt_paranoid caller sequence adds additional caller side + * hash validation. This stops such circumvention attacks dead, but at the cost + * of adding a load. + * + * <fineibt_paranoid_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11 + * e: 75 fd jne d <fineibt_paranoid_start+0xd> + * 10: 41 ff d3 call *%r11 + * 13: 90 nop + * + * Notably LEA does not modify flags and can be reordered with the CMP, + * avoiding a dependency. Again, using a non-taken (backwards) branch + * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the + * Jcc.d8, causing #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_paranoid_start: \n" + " movl $0x12345678, %r10d \n" + " cmpl -9(%r11), %r10d \n" + " lea -0x10(%r11), %r11 \n" + " jne fineibt_paranoid_start+0xd \n" + "fineibt_paranoid_ind: \n" + " call *%r11 \n" + " nop \n" + "fineibt_paranoid_end: \n" + ".popsection \n" +); + +extern u8 fineibt_paranoid_start[]; +extern u8 fineibt_paranoid_ind[]; +extern u8 fineibt_paranoid_end[]; + +#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) +#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) +#define fineibt_paranoid_ud 0xd + +static u32 decode_preamble_hash(void *addr, int *reg) { u8 *p = addr; - /* b8 78 56 34 12 mov $0x12345678,%eax */ - if (p[0] == 0xb8) + /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ + if (p[0] >= 0xb8 && p[0] < 0xc0) { + if (reg) + *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); + } return 0; /* invalid hash value */ } @@ -1124,11 +1237,11 @@ static u32 decode_caller_hash(void *addr) { u8 *p = addr; - /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); - /* e8 0c 78 56 34 12 jmp.d8 +12 */ + /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); @@ -1136,7 +1249,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1148,23 +1261,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, jmp, 2); + text_poke_early(addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. 
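The widened decode_preamble_hash() above accepts any of the "movl $imm32, \reg" encodings 0xb8..0xbf and reports which register was used; the FINEIBT_BHI handling further down reuses that register number as the argument arity. A stand-alone sketch of the same decode, for illustration only:

/*
 * Model of the preamble decode: opcode 0xb8+reg selects the register,
 * the following four bytes are the kCFI hash immediate.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t decode_mov_imm32(const uint8_t *p, int *reg)
{
	uint32_t imm;

	if (p[0] < 0xb8 || p[0] >= 0xc0)
		return 0;		/* not a movl $imm32, \reg */

	if (reg)
		*reg = p[0] - 0xb8;	/* 0 = %eax, 2 = %edx, ... */

	memcpy(&imm, p + 1, sizeof(imm));
	return imm;
}

int main(void)
{
	/* ba 78 56 34 12 = movl $0x12345678, %edx */
	uint8_t insn[5] = { 0xba, 0x78, 0x56, 0x34, 0x12 };
	int reg = -1;

	printf("hash %#x reg %d\n", decode_mov_imm32(insn, &reg), reg);
	return 0;
}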
@@ -1174,126 +1284,212 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, mov, 2); + text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(wr_addr + 1, &hash, 4); + text_poke_early(addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) +static void cfi_fineibt_bhi_preamble(void *addr, int arity) +{ + if (!arity) + return; + + if (!cfi_warn && arity == 1) { + /* + * Crazy scheme to allow arity-1 inline: + * + * __cfi_foo: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d + * b: 49 0f 45 fa cmovne %r10, %rdi + * f: 75 f5 jne __cfi_foo+6 + * 11: 0f 1f 00 nopl (%rax) + * + * Code that direct calls to foo()+0, decodes the tail end as: + * + * foo: + * 0: f5 cmc + * 1: 0f 1f 00 nopl (%rax) + * + * which clobbers CF, but does not affect anything ABI + * wise. + * + * Notably, this scheme is incompatible with permissive CFI + * because the CMOVcc is unconditional and RDI will have been + * clobbered. + */ + const u8 magic[9] = { + 0x49, 0x0f, 0x45, 0xfa, + 0x75, 0xf5, + BYTES_NOP3, + }; + + text_poke_early(addr + fineibt_preamble_bhi, magic, 9); + + return; + } + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); +} + +static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); + int arity; u32 hash; - hash = decode_preamble_hash(wr_addr); + /* + * When the function doesn't start with ENDBR the compiler will + * have determined there are no indirect calls to it and we + * don't need no CFI either. 
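For reference, the contract these preamble and caller rewrites implement is a simple hash handshake: the FineIBT caller loads the expected hash into %r10d, the callee's subl must bring it to zero, and a non-zero result sends the jne onto the trapping byte. A plain-C model of that check (not the kernel code itself):

/* Stand-alone model of the FineIBT caller/callee hash handshake. */
#include <stdint.h>
#include <stdio.h>

static int fineibt_check(uint32_t caller_r10d, uint32_t callee_hash)
{
	/* callee side: subl $callee_hash, %r10d; jne <trap> */
	return (uint32_t)(caller_r10d - callee_hash) == 0;
}

int main(void)
{
	uint32_t hash = 0x12345678;

	printf("%d %d\n",
	       fineibt_check(hash, hash),		/* match: 1 */
	       fineibt_check(hash, hash ^ 1));		/* mismatch: 0 */
	return 0;
}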
+ */ + if (!is_endbr(addr + 16)) + continue; + + hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + + WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, + "kCFI preamble has wrong register at: %pS %*ph\n", + addr, 5, addr); + + if (cfi_bhi) + cfi_fineibt_bhi_preamble(addr, arity); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) +static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr + 16, wr_addr + 16, false); + if (!exact_endbr(addr + 16)) + continue; + + poison_endbr(addr + 16); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(wr_addr + 2, &hash, 4); + text_poke_early(addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; + BUG_ON(fineibt_paranoid_size != 20); + for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; + struct insn insn; + u8 bytes[20]; u32 hash; + int ret; + u8 op; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - if (hash) { - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); + hash = decode_caller_hash(addr); + if (!hash) + continue; + + if (!cfi_paranoid) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + /* rely on apply_retpolines() */ + continue; + } + + /* cfi_paranoid */ + ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { + WARN_ON_ONCE(1); + continue; } - /* rely on apply_retpolines() */ + + memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); + memcpy(bytes + fineibt_caller_hash, &hash, 4); + + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + + text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { - bool builtin = mod ? 
false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1302,8 +1498,15 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; - if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { + /* + * FRED has much saner context on exception entry and + * is less easy to take advantage of. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + cfi_paranoid = true; cfi_mode = CFI_FINEIBT; + } } /* @@ -1311,7 +1514,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1322,11 +1525,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } @@ -1338,7 +1541,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1348,20 +1551,23 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); + ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi, mod); + cfi_rewrite_endbr(start_cfi, end_cfi); - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) { + pr_info("Using %sFineIBT%s CFI\n", + cfi_paranoid ? "paranoid " : "", + cfi_bhi ? "+BHI" : ""); + } return; default: @@ -1377,11 +1583,25 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr, void *wr_addr) +static void poison_cfi(void *addr) { + /* + * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, + * some (static) functions for which they can determine the address + * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. + * + * As such, these functions will get sealed, but we need to be careful + * to not unconditionally scribble the previous function. + */ switch (cfi_mode) { case CFI_FINEIBT: /* + * FineIBT prefix should start with an ENDBR. + */ + if (!is_endbr(addr)) + break; + + /* * __cfi_\func: * osp nopl (%rax) * subl $0, %r10d @@ -1389,17 +1609,23 @@ static void poison_cfi(void *addr, void *wr_addr) * ud2 * 1: nop */ - poison_endbr(addr, wr_addr, false); - poison_hash(wr_addr + fineibt_preamble_hash); + poison_endbr(addr); + poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: /* + * kCFI prefix should start with a valid hash. 
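Condensing the CFI_AUTO handling earlier in this hunk: auto resolves to kCFI, is upgraded to FineIBT when IBT is available, and additionally enables the paranoid caller sequence unless FRED is present. A rough stand-alone model of that decision (the enum values and helper below are illustrative, not the kernel's):

/* Sketch of the cfi=auto resolution done in __apply_fineibt(). */
#include <stdbool.h>
#include <stdio.h>

enum cfi_mode { CFI_OFF, CFI_AUTO, CFI_KCFI, CFI_FINEIBT };

static enum cfi_mode resolve_cfi_mode(enum cfi_mode mode, bool has_ibt,
				      bool has_fred, bool *paranoid)
{
	if (mode != CFI_AUTO)
		return mode;

	if (!has_ibt)
		return CFI_KCFI;

	/* FRED has saner exception entry state, so skip the paranoid form. */
	if (!has_fred)
		*paranoid = true;
	return CFI_FINEIBT;
}

int main(void)
{
	bool paranoid = false;
	enum cfi_mode m = resolve_cfi_mode(CFI_AUTO, true, false, &paranoid);

	printf("mode %d paranoid %d\n", m, paranoid);	/* FineIBT, paranoid */
	return 0;
}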
+ */ + if (!decode_preamble_hash(addr, NULL)) + break; + + /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(wr_addr + 1); + poison_hash(addr + 1); break; default: @@ -1407,24 +1633,135 @@ static void poison_cfi(void *addr, void *wr_addr) } } -#else +/* + * When regs->ip points to a 0xEA byte in the FineIBT preamble, + * return true and fill out target and type. + * + * We check the preamble by checking for the ENDBR instruction relative to the + * 0xEA instruction. + */ +static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_preamble_ud; + u32 hash; + + if (!exact_endbr((void *)addr)) + return false; + + *target = addr + fineibt_preamble_size; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * Since regs->ip points to the middle of an instruction; it cannot + * continue with the normal fixup. + */ + regs->ip = *target; + + return true; + +Efault: + return false; +} + +/* + * regs->ip points to one of the UD2 in __bhi_args[]. + */ +static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr; + u32 hash; + + if (!cfi_bhi) + return false; + + if (regs->ip < (unsigned long)__bhi_args || + regs->ip >= (unsigned long)__bhi_args_end) + return false; + + /* + * Fetch the return address from the stack, this points to the + * FineIBT preamble. Since the CALL instruction is in the 5 last + * bytes of the preamble, the return address is in fact the target + * address. + */ + __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); + *target = addr; + + addr -= fineibt_preamble_size; + if (!exact_endbr((void *)addr)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * The UD2 sites are constructed with a RET immediately following, + * as such the non-fatal case can use the regular fixup. + */ + return true; + +Efault: + return false; +} + +/* + * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] + * sequence. + */ +static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_paranoid_ud; + u32 hash; + + if (!cfi_paranoid || !is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_caller_hash, u32, Efault); + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. 
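The arithmetic in decode_fineibt_preamble() above is worth spelling out: at the #UD the SUB has already executed, so %r10d holds caller_hash - callee_hash, and adding back the hash stored in the preamble recovers the caller's hash, which is what gets reported as the type. A small stand-in in plain C:

/* Model of how the reported CFI type is reconstructed at the trap. */
#include <stdint.h>
#include <stdio.h>

static uint32_t report_type(uint32_t r10_at_trap, uint32_t preamble_hash)
{
	return r10_at_trap + preamble_hash;	/* *type in the hunk above */
}

int main(void)
{
	uint32_t callee = 0x12345678, caller = 0xdeadbeef;
	uint32_t r10 = caller - callee;		/* state after the SUB ran */

	printf("%#x\n", report_type(r10, callee));	/* 0xdeadbeef */
	return 0;
}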
+ */ + return true; + +Efault: + return false; +} + +bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + if (decode_fineibt_paranoid(regs, target, type)) + return true; + + if (decode_fineibt_bhi(regs, target, type)) + return true; + + return decode_fineibt_preamble(regs, target, type); +} + +#else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr) { } +static void poison_cfi(void *addr) { } #endif -#endif +#endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, mod); + start_cfi, end_cfi, + /* .builtin = */ false); } #ifdef CONFIG_SMP @@ -1721,27 +2058,27 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, NULL); + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); - apply_returns(__return_sites, __return_sites_end, NULL); - - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); /* - * Now all calls are established. Apply the call thunks if - * required. + * Adjust all CALL instructions to point to func()-10, including + * those in .altinstr_replacement. */ callthunks_patch_builtin_calls(); + apply_alternatives(__alt_instructions, __alt_instructions_end); + /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 67e773744edb..6d12a9b69432 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -73,7 +73,6 @@ static int amd_cache_northbridges(void) amd_northbridges.nb = nb; for (i = 0; i < amd_northbridges.num; i++) { - node_to_amd_nb(i)->root = amd_node_get_root(i); node_to_amd_nb(i)->misc = amd_node_get_func(i, 3); /* diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index d2ec7fd555c5..b670fa85c61b 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -8,6 +8,7 @@ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> */ +#include <linux/debugfs.h> #include <asm/amd_node.h> /* @@ -93,10 +94,14 @@ static struct pci_dev **amd_roots; /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); +static bool smn_exclusive; #define SMN_INDEX_OFFSET 0x60 #define SMN_DATA_OFFSET 0x64 +#define HSMP_INDEX_OFFSET 0xc4 +#define HSMP_DATA_OFFSET 0xc8 + /* * SMN accesses may fail in ways that are difficult to detect here in the called * functions amd_smn_read() and amd_smn_write(). 
Therefore, callers must do @@ -146,6 +151,9 @@ static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, b if (!root) return err; + if (!smn_exclusive) + return err; + guard(mutex)(&smn_mutex); err = pci_write_config_dword(root, i_off, address); @@ -179,6 +187,93 @@ int __must_check amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write); +} +EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr); + +static struct dentry *debugfs_dir; +static u16 debug_node; +static u32 debug_address; + +static ssize_t smn_node_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u16 node; + int ret; + + ret = kstrtou16_from_user(userbuf, count, 0, &node); + if (ret) + return ret; + + if (node >= amd_num_nodes()) + return -ENODEV; + + debug_node = node; + return count; +} + +static int smn_node_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_node); + return 0; +} + +static ssize_t smn_address_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &debug_address); + if (ret) + return ret; + + return count; +} + +static int smn_address_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_address); + return 0; +} + +static int smn_value_show(struct seq_file *m, void *v) +{ + u32 val; + int ret; + + ret = amd_smn_read(debug_node, debug_address, &val); + if (ret) + return ret; + + seq_printf(m, "0x%08x\n", val); + return 0; +} + +static ssize_t smn_value_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u32 val; + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &val); + if (ret) + return ret; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + ret = amd_smn_write(debug_node, debug_address, val); + if (ret) + return ret; + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(smn_node); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_address); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_value); + static int amd_cache_roots(void) { u16 node, num_nodes = amd_num_nodes(); @@ -193,6 +288,48 @@ static int amd_cache_roots(void) return 0; } +static int reserve_root_config_spaces(void) +{ + struct pci_dev *root = NULL; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus))) { + /* Root device is Device 0 Function 0 on each Primary Bus. */ + root = pci_get_slot(bus, 0); + if (!root) + continue; + + if (root->vendor != PCI_VENDOR_ID_AMD && + root->vendor != PCI_VENDOR_ID_HYGON) + continue; + + pci_dbg(root, "Reserving PCI config space\n"); + + /* + * There are a few SMN index/data pairs and other registers + * that shouldn't be accessed by user space. + * So reserve the entire PCI config space for simplicity rather + * than covering specific registers piecemeal. 
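Assuming arch_debugfs_dir is the usual x86 debugfs directory, the new SMN files land under /sys/kernel/debug/x86/amd_smn/ and are only created when amd_smn_debugfs_enable is on the kernel command line. A hypothetical user-space read through that interface (the SMN offset below is a placeholder, not a real register):

/* Sketch of driving the node/address/value debugfs files; run as root. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	ret = write(fd, val, strlen(val));
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	char buf[32] = "";
	int fd;

	write_str("/sys/kernel/debug/x86/amd_smn/node", "0");
	write_str("/sys/kernel/debug/x86/amd_smn/address", "0x0");	/* placeholder offset */

	fd = open("/sys/kernel/debug/x86/amd_smn/value", O_RDONLY);
	if (fd >= 0 && read(fd, buf, sizeof(buf) - 1) > 0)
		printf("SMN value: %s", buf);
	if (fd >= 0)
		close(fd);
	return 0;
}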
+ */ + if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { + pci_err(root, "Failed to reserve config space\n"); + return -EEXIST; + } + } + + smn_exclusive = true; + return 0; +} + +static bool enable_dfs; + +static int __init amd_smn_enable_dfs(char *str) +{ + enable_dfs = true; + return 1; +} +__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs); + static int __init amd_smn_init(void) { int err; @@ -209,6 +346,18 @@ static int __init amd_smn_init(void) if (err) return err; + err = reserve_root_config_spaces(); + if (err) + return err; + + if (enable_dfs) { + debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir); + + debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops); + debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops); + debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops); + } + return 0; } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3bf0487cf3b7..52d1808ee360 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o endif -# APIC probe will depend on the listing order here -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o - # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e893dc6f11c1..62584a347931 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1371,8 +1371,6 @@ void __init apic_intr_mode_init(void) x86_64_probe_apic(); - x86_32_install_bigsmp(); - if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1674,7 +1672,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } topology_register_boot_apic(boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC @@ -2014,8 +2011,8 @@ static bool __init detect_init_APIC(void) case X86_VENDOR_HYGON: break; case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) + if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) || + boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) break; goto no_apic; default: diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c deleted file mode 100644 index 9285d500d5b4..000000000000 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. - * - * Drives the local APIC in "clustered mode". 
- */ -#include <linux/cpumask.h> -#include <linux/dmi.h> -#include <linux/smp.h> - -#include <asm/apic.h> -#include <asm/io_apic.h> - -#include "local.h" - -static u32 bigsmp_get_apic_id(u32 x) -{ - return (x >> 24) & 0xFF; -} - -static void bigsmp_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void bigsmp_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - -static int dmi_bigsmp; /* can be set by dmi scanners */ - -static int hp_ht_bigsmp(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); - dmi_bigsmp = 1; - - return 0; -} - - -static const struct dmi_system_id bigsmp_dmi_table[] = { - { hp_ht_bigsmp, "HP ProLiant DL760 G2", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P44-"), - } - }, - - { hp_ht_bigsmp, "HP ProLiant DL740", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P47-"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - -static int probe_bigsmp(void) -{ - return dmi_check_system(bigsmp_dmi_table); -} - -static struct apic apic_bigsmp __ro_after_init = { - - .name = "bigsmp", - .probe = probe_bigsmp, - - .dest_mode_logical = false, - - .disable_esr = 1, - - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = bigsmp_get_apic_id, - - .calc_dest_apicid = apic_default_calc_apicid, - - .send_IPI = default_send_IPI_single_phys, - .send_IPI_mask = default_send_IPI_mask_sequence_phys, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = bigsmp_send_IPI_allbutself, - .send_IPI_all = bigsmp_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -bool __init apic_bigsmp_possible(bool cmdline_override) -{ - return apic == &apic_bigsmp || !cmdline_override; -} - -void __init apic_bigsmp_force(void) -{ - if (apic != &apic_bigsmp) - apic_install_driver(&apic_bigsmp); -} - -apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 5da693d633b7..98a57cb4aa86 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -3,6 +3,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/smp.h> +#include <linux/string_choices.h> #include <asm/io_apic.h> @@ -23,7 +24,7 @@ __setup("no_ipi_broadcast=", apic_ipi_shorthand); static int __init print_ipi_mode(void) { pr_info("IPI shorthand broadcast: %s\n", - apic_ipi_shorthand_off ? "disabled" : "enabled"); + str_disabled_enabled(apic_ipi_shorthand_off)); return 0; } late_initcall(print_ipi_mode); @@ -287,34 +288,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } - -#ifdef CONFIG_SMP -static int convert_apicid_to_cpu(u32 apic_id) -{ - int i; - - for_each_possible_cpu(i) { - if (per_cpu(x86_cpu_to_apicid, i) == apic_id) - return i; - } - return -1; -} - -int safe_smp_processor_id(void) -{ - u32 apicid; - int cpuid; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - - apicid = read_apic_id(); - if (apicid == BAD_APICID) - return 0; - - cpuid = convert_apicid_to_cpu(apicid); - - return cpuid >= 0 ? 
cpuid : 0; -} -#endif #endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 842fe28496be..bdcf609eb283 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -65,17 +65,4 @@ void default_send_IPI_self(int vector); void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); -void x86_32_probe_bigsmp_early(void); -void x86_32_install_bigsmp(void); -#else -static inline void x86_32_probe_bigsmp_early(void) { } -static inline void x86_32_install_bigsmp(void) { } -#endif - -#ifdef CONFIG_X86_BIGSMP -bool apic_bigsmp_possible(bool cmdline_selected); -void apic_bigsmp_force(void); -#else -static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; -static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f75ee345c02d..87bc9e7ca5d6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -93,35 +93,6 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init x86_32_probe_bigsmp_early(void) -{ - if (nr_cpu_ids <= 8 || xen_pv_domain()) - return; - - if (IS_ENABLED(CONFIG_X86_BIGSMP)) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(boot_cpu_apic_version)) - break; - /* P4 and above */ - fallthrough; - case X86_VENDOR_HYGON: - case X86_VENDOR_AMD: - if (apic_bigsmp_possible(cmdline_apic)) - return; - break; - } - } - pr_info("Limiting to 8 possible CPUs\n"); - set_nr_cpu_ids(8); -} - -void __init x86_32_install_bigsmp(void) -{ - if (nr_cpu_ids > 8 && !xen_pv_domain()) - apic_bigsmp_force(); -} - void __init x86_32_probe_apic(void) { if (!cmdline_apic) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 736f62812f5c..72fa4bb78f0a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd, return err ? err : IRQ_SET_MASK_OK; } +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; + + /* + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. + */ + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; +} + +/* + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. + */ +static void apic_force_complete_move(struct irq_data *irqd) +{ + unsigned int cpu = smp_processor_id(); + struct apic_chip_data *apicd; + unsigned int vector; + + guard(raw_spinlock)(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + return; + + /* + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. 
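apic_force_complete_move() above relies on guard(raw_spinlock)(&vector_lock) instead of the old lock/goto-unlock pattern, so every return path drops the lock automatically. Outside the kernel the same idea can be approximated with the compiler's cleanup attribute; a sketch of the concept, not the kernel's guard() implementation:

/* Scope-based unlock using __attribute__((cleanup)), GCC/Clang only. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

#define scoped_guard_mutex(m) \
	pthread_mutex_t *guard_ptr __attribute__((cleanup(unlock_cleanup))) = \
		(pthread_mutex_lock(m), (m))

static int do_work(int bail_early)
{
	scoped_guard_mutex(&lock);

	if (bail_early)
		return -1;	/* lock released here too */

	puts("did work under the lock");
	return 0;
}

int main(void)
{
	do_work(0);
	do_work(1);
	return 0;
}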
+ */ + vector = apicd->prev_vector; + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) + return; + + /* + * This is tricky. If the cleanup of the old vector has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. + * + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (apicd->move_in_progress) { + /* + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendezvous in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theoretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. + */ + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqd->irq, vector); + } + free_moved_vector(apicd); +} + #else -# define apic_set_affinity NULL +# define apic_set_affinity NULL +# define apic_force_complete_move NULL #endif static int apic_retrigger_irq(struct irq_data *irqd) @@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data, } static struct irq_chip lapic_controller = { - .name = "APIC", - .irq_ack = apic_ack_edge, - .irq_set_affinity = apic_set_affinity, - .irq_compose_msi_msg = x86_vector_msi_compose_msg, - .irq_retrigger = apic_retrigger_irq, + .name = "APIC", + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, + .irq_force_complete_move = apic_force_complete_move, + .irq_retrigger = apic_retrigger_irq, }; #ifdef CONFIG_SMP -static void free_moved_vector(struct apic_chip_data *apicd) -{ - unsigned int vector = apicd->prev_vector; - unsigned int cpu = apicd->prev_cpu; - bool managed = apicd->is_managed; - - /* - * Managed interrupts are usually not migrated away - * from an online CPU, but CPU isolation 'managed_irq' - * can make that happen. 
- * 1) Activation does not take the isolation into account - * to keep the code simple - * 2) Migration away from an isolated CPU can happen when - * a non-isolated CPU which is in the calculated - * affinity mask comes online. - */ - trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - hlist_del_init(&apicd->clist); - apicd->prev_vector = 0; - apicd->move_in_progress = 0; -} - static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { struct apic_chip_data *apicd; @@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg) __vector_schedule_cleanup(apicd); } -/* - * Called from fixup_irqs() with @desc->lock held and interrupts disabled. - */ -void irq_force_complete_move(struct irq_desc *desc) -{ - unsigned int cpu = smp_processor_id(); - struct apic_chip_data *apicd; - struct irq_data *irqd; - unsigned int vector; - - /* - * The function is called for all descriptors regardless of which - * irqdomain they belong to. For example if an IRQ is provided by - * an irq_chip as part of a GPIO driver, the chip data for that - * descriptor is specific to the irq_chip in question. - * - * Check first that the chip_data is what we expect - * (apic_chip_data) before touching it any further. - */ - irqd = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); - if (!irqd) - return; - - raw_spin_lock(&vector_lock); - apicd = apic_chip_data(irqd); - if (!apicd) - goto unlock; - - /* - * If prev_vector is empty or the descriptor is neither currently - * nor previously on the outgoing CPU no action required. - */ - vector = apicd->prev_vector; - if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) - goto unlock; - - /* - * This is tricky. If the cleanup of the old vector has not been - * done yet, then the following setaffinity call will fail with - * -EBUSY. This can leave the interrupt in a stale state. - * - * All CPUs are stuck in stop machine with interrupts disabled so - * calling __irq_complete_move() would be completely pointless. - * - * 1) The interrupt is in move_in_progress state. That means that we - * have not seen an interrupt since the io_apic was reprogrammed to - * the new vector. - * - * 2) The interrupt has fired on the new vector, but the cleanup IPIs - * have not been processed yet. - */ - if (apicd->move_in_progress) { - /* - * In theory there is a race: - * - * set_ioapic(new_vector) <-- Interrupt is raised before update - * is effective, i.e. it's raised on - * the old vector. - * - * So if the target cpu cannot handle that interrupt before - * the old vector is cleaned up, we get a spurious interrupt - * and in the worst case the ioapic irq line becomes stale. - * - * But in case of cpu hotplug this should be a non issue - * because if the affinity update happens right before all - * cpus rendezvous in stop machine, there is no way that the - * interrupt can be blocked on the target cpu because all cpus - * loops first with interrupts enabled in stop machine, so the - * old vector is not yet cleaned up when the interrupt fires. - * - * So the only way to run into this issue is if the delivery - * of the interrupt on the apic/system bus would be delayed - * beyond the point where the target cpu disables interrupts - * in stop machine. I doubt that it can happen, but at least - * there is a theoretical chance. 
Virtualization might be - * able to expose this, but AFAICT the IOAPIC emulation is not - * as stupid as the real hardware. - * - * Anyway, there is nothing we can do about that at this point - * w/o refactoring the whole fixup_irq() business completely. - * We print at least the irq number and the old vector number, - * so we have the necessary information when a problem in that - * area arises. - */ - pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqd->irq, vector); - } - free_moved_vector(apicd); -unlock: - raw_spin_unlock(&vector_lock); -} - #ifdef CONFIG_HOTPLUG_CPU /* * Note, this is not accurate accounting, but at least good enough to diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a98020bf31bb..ad4ea6fb3b6c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -107,11 +107,6 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); - OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); - OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING - OFFSET(X86_call_depth, pcpu_hot, call_depth); -#endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) /* Offset for fields in aria_ctx */ BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bb65371ea9df..590b6cd0eac0 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -54,11 +54,5 @@ int main(void) BLANK(); #undef ENTRY - BLANK(); - -#ifdef CONFIG_STACKPROTECTOR - OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); - BLANK(); -#endif return 0; } diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 3fed7ae58b60..73274d76ce16 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -8,6 +8,7 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/acpi.h> +#include <linux/bitops.h> #include <asm/io.h> #include <linux/mc146818rtc.h> @@ -20,27 +21,13 @@ int sbf_port __initdata = -1; /* set via acpi_boot_init() */ -static int __init parity(u8 v) -{ - int x = 0; - int i; - - for (i = 0; i < 8; i++) { - x ^= (v & 1); - v >>= 1; - } - - return x; -} - static void __init sbf_write(u8 v) { unsigned long flags; if (sbf_port != -1) { - v &= ~SBF_PARITY; - if (!parity(v)) - v |= SBF_PARITY; + if (!parity8(v)) + v ^= SBF_PARITY; printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); @@ -66,14 +53,14 @@ static u8 __init sbf_read(void) return v; } -static int __init sbf_value_valid(u8 v) +static bool __init sbf_value_valid(u8 v) { if (v & SBF_RESERVED) /* Reserved bits */ - return 0; - if (!parity(v)) - return 0; + return false; + if (!parity8(v)) + return false; - return 1; + return true; } static int __init sbf_init(void) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 8418a892d195..25ae54250112 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -240,21 +240,10 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) } static __init_or_module void -patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, - const struct core_text *ct) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) - patch_call((void *)&a->instr_offset + a->instr_offset, ct); -} - -static __init_or_module void callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites 
%s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -263,8 +252,6 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .alt_start = __alt_instructions, - .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c index e6bf78fac146..77086cf565ec 100644 --- a/arch/x86/kernel/cfi.c +++ b/arch/x86/kernel/cfi.c @@ -67,16 +67,30 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, */ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { - unsigned long target; + unsigned long target, addr = regs->ip; u32 type; - if (!is_cfi_trap(regs->ip)) - return BUG_TRAP_TYPE_NONE; + switch (cfi_mode) { + case CFI_KCFI: + if (!is_cfi_trap(addr)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, addr); + + break; - if (!decode_cfi_insn(regs, &target, &type)) - return report_cfi_failure_noaddr(regs, regs->ip); + case CFI_FINEIBT: + if (!decode_fineibt_insn(regs, &target, &type)) + return BUG_TRAP_TYPE_NONE; + + break; + + default: + return BUG_TRAP_TYPE_NONE; + } - return report_cfi_failure(regs, regs->ip, &target, type); + return report_cfi_failure(regs, addr, &target, type); } /* diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 54194f5995de..79569f72b8ee 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -29,6 +29,8 @@ #include "cpu.h" +u16 invlpgb_count_max __ro_after_init; + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -632,7 +634,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); if (!rdmsrl_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); @@ -1073,6 +1075,10 @@ static void init_amd(struct cpuinfo_x86 *c) /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. 
*/ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + + /* Enable Translation Cache Extension */ + if (cpu_has(c, X86_FEATURE_TCE)) + msr_set_bit(MSR_EFER, _EFER_TCE); } #ifdef CONFIG_X86_32 @@ -1105,8 +1111,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1119,26 +1125,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; + + tlb_lli_4m = tlb_lli_2m >> 1; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + /* Max number of pages INVLPGB can invalidate in one shot */ + if (cpu_has(c, X86_FEATURE_INVLPGB)) + invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f642de2ebdac..6cf31a1649c4 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -498,7 +498,7 @@ void arch_scale_freq_tick(void) */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) -unsigned int arch_freq_get_on_cpu(int cpu) +int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a5d0998d7604..4386aa6c69e1 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -113,6 +113,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control IBPB on vCPU load */ +DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); + /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -234,7 +238,7 @@ static void x86_amd_ssb_disable(void) /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_MDS) ? 
MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -243,6 +247,40 @@ static const char * const mds_strings[] = { [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_AUTO, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF; + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_AUTO, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF; + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_AUTO, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -250,6 +288,9 @@ static void __init mds_select_mitigation(void) return; } + if (mds_mitigation == MDS_MITIGATION_AUTO) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; @@ -286,16 +327,6 @@ early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt -enum taa_mitigations { - TAA_MITIGATION_OFF, - TAA_MITIGATION_UCODE_NEEDED, - TAA_MITIGATION_VERW, - TAA_MITIGATION_TSX_DISABLED, -}; - -/* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -386,15 +417,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt -enum mmio_mitigations { - MMIO_MITIGATION_OFF, - MMIO_MITIGATION_UCODE_NEEDED, - MMIO_MITIGATION_VERW, -}; - -/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -483,16 +505,6 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Register File Data Sampling: " fmt -enum rfds_mitigations { - RFDS_MITIGATION_OFF, - RFDS_MITIGATION_VERW, - RFDS_MITIGATION_UCODE_NEEDED, -}; - -/* Default mitigation for Register File Data Sampling */ -static enum rfds_mitigations rfds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RFDS) ? 
RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; - static const char * const rfds_strings[] = { [RFDS_MITIGATION_OFF] = "Vulnerable", [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", @@ -508,6 +520,9 @@ static void __init rfds_select_mitigation(void) if (rfds_mitigation == RFDS_MITIGATION_OFF) return; + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else @@ -1293,9 +1308,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { + enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; + mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? + SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; @@ -1308,7 +1327,7 @@ spectre_v2_parse_user_cmdline(void) ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + return mode; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1318,8 +1337,8 @@ spectre_v2_parse_user_cmdline(void) } } - pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + pr_err("Unknown user space protection option (%s). Switching to default\n", arg); + return mode; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1331,16 +1350,11 @@ static void __init spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - bool smt_possible = IS_ENABLED(CONFIG_SMP); enum spectre_v2_user_cmd cmd; if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: @@ -1364,7 +1378,7 @@ spectre_v2_user_select_mitigation(void) /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + static_branch_enable(&switch_vcpu_ibpb); spectre_v2_user_ibpb = mode; switch (cmd) { @@ -1401,7 +1415,7 @@ spectre_v2_user_select_mitigation(void) * so allow for STIBP to be selected in those cases. 
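The new *_MITIGATION_AUTO entries above change the compile-time default from a concrete mode to AUTO, and each select function lowers AUTO to the real mitigation at boot (MDS to FULL, RFDS to VERW, and so on). A rough model of that two-step selection, simplified from the RFDS path:

/* Illustrative only; enum names and checks are condensed, not the kernel's. */
#include <stdbool.h>
#include <stdio.h>

enum rfds_mit { RFDS_OFF, RFDS_AUTO, RFDS_VERW, RFDS_UCODE_NEEDED };

static enum rfds_mit rfds_select(enum rfds_mit cur, bool affected, bool ucode_clears)
{
	if (!affected)
		return RFDS_OFF;

	if (cur == RFDS_AUTO)
		cur = RFDS_VERW;	/* resolve the config default */

	if (cur == RFDS_VERW && !ucode_clears)
		cur = RFDS_UCODE_NEEDED;

	return cur;
}

int main(void)
{
	printf("%d\n", rfds_select(RFDS_AUTO, true, true));	/* RFDS_VERW */
	return 0;
}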
*/ if (!boot_cpu_has(X86_FEATURE_STIBP) || - !smt_possible || + !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; @@ -1973,6 +1987,7 @@ void cpu_bugs_smt_update(void) switch (mds_mitigation) { case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: case MDS_MITIGATION_VMWERV: if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) pr_warn_once(MDS_MSG_SMT); @@ -1984,6 +1999,7 @@ void cpu_bugs_smt_update(void) switch (taa_mitigation) { case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: case TAA_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(TAA_MSG_SMT); @@ -1995,6 +2011,7 @@ void cpu_bugs_smt_update(void) switch (mmio_mitigation) { case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: case MMIO_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(MMIO_MSG_SMT); @@ -2522,6 +2539,7 @@ enum srso_mitigation { SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, SRSO_MITIGATION_IBPB_ON_VMEXIT, + SRSO_MITIGATION_BP_SPEC_REDUCE, }; enum srso_mitigation_cmd { @@ -2539,7 +2557,8 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only", + [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; @@ -2578,7 +2597,7 @@ static void __init srso_select_mitigation(void) srso_cmd == SRSO_CMD_OFF) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; - return; + goto out; } if (has_microcode) { @@ -2590,7 +2609,7 @@ static void __init srso_select_mitigation(void) */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - return; + goto out; } if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { @@ -2670,6 +2689,12 @@ static void __init srso_select_mitigation(void) ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: + if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { + pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); + srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; + break; + } + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); @@ -2691,7 +2716,15 @@ ibpb_on_vmexit: } out: - pr_info("%s\n", srso_strings[srso_mitigation]); + /* + * Clear the feature flag if this mitigation is not selected as that + * feature flag controls the BpSpecReduce MSR bit toggling in KVM. + */ + if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) + setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); + + if (srso_mitigation != SRSO_MITIGATION_NONE) + pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 6cba85c79d42..97222efb4d2a 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -192,7 +192,13 @@ static void __split_lock_reenable(struct work_struct *work) { sld_update_msr(true); } -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); +/* + * In order for each CPU to schedule its delayed work independently of the + * others, delayed work struct must be per-CPU. 
This is not required when + * sysctl_sld_mitigate is enabled because of the semaphore that limits + * the number of simultaneously scheduled delayed works to 1. + */ +static DEFINE_PER_CPU(struct delayed_work, sl_reenable); /* * If a CPU goes offline with pending delayed work to re-enable split lock @@ -213,7 +219,7 @@ static int splitlock_cpu_offline(unsigned int cpu) static void split_lock_warn(unsigned long ip) { - struct delayed_work *work; + struct delayed_work *work = NULL; int cpu; if (!current->reported_split_lock) @@ -235,11 +241,17 @@ static void split_lock_warn(unsigned long ip) if (down_interruptible(&buslock_sem) == -EINTR) return; work = &sl_reenable_unlock; - } else { - work = &sl_reenable; } cpu = get_cpu(); + + if (!work) { + work = this_cpu_ptr(&sl_reenable); + /* Deferred initialization of per-CPU struct */ + if (!work->work.func) + INIT_DELAYED_WORK(work, __split_lock_reenable); + } + schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index a6c6bccfa8b8..b3a520959b51 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -8,21 +8,19 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include <linux/slab.h> #include <linux/cacheinfo.h> +#include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/sched.h> -#include <linux/capability.h> -#include <linux/sysfs.h> #include <linux/pci.h> #include <linux/stop_machine.h> +#include <linux/sysfs.h> -#include <asm/cpufeature.h> -#include <asm/cacheinfo.h> #include <asm/amd_nb.h> -#include <asm/smp.h> +#include <asm/cacheinfo.h> +#include <asm/cpufeature.h> #include <asm/mtrr.h> +#include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" @@ -31,7 +29,6 @@ #define LVL_1_DATA 2 #define LVL_2 3 #define LVL_3 4 -#define LVL_TRACE 5 /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -96,10 +93,6 @@ static const struct _cache_table cache_table[] = { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -787,19 +780,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache - */ - if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) { + + /* Don't use CPUID(2) if CPUID(4) is supported. 
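The cacheinfo changes here drop the trace-cache descriptors and make init_intel_cacheinfo() fall back to CPUID leaf 2 only when leaf 4 reports no leaves. As a reminder of how leaf 2 is walked, a stand-alone sketch (descriptor-validity checks such as the high bit of each register are omitted for brevity):

/* Dump CPUID leaf 2 descriptor bytes; byte 0 of EAX is the iteration count. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int regs[4];
	unsigned int n, i;

	if (!__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]))
		return 1;	/* leaf 2 not supported */

	n = regs[0] & 0xff;	/* number of times to execute CPUID(2) */
	printf("CPUID(2) iterations: %u\n", n);

	for (i = 1; i < 16; i++) {
		unsigned char des = ((unsigned char *)regs)[i];

		if (des)
			printf("descriptor 0x%02x\n", des);
	}
	return 0;
}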
*/ + if (!ci->num_leaves && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (ci->num_leaves && c->x86 == 15) - only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; @@ -820,8 +807,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) /* look up this descriptor in the table */ while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7cce91b19fb2..12126adbc3a9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -667,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -846,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -860,12 +860,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } void get_cpu_vendor(struct cpuinfo_x86 *c) @@ -1164,7 +1162,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), @@ -1205,6 +1203,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues) +#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \ + X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues) + #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1253,9 +1254,9 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | 
GDS), - VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), @@ -1331,8 +1332,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) { setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && @@ -1479,15 +1482,96 @@ static void detect_nopl(void) #endif } +static inline bool parse_set_clear_cpuid(char *arg, bool set) +{ + char *opt; + int taint = 0; + + while (arg) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&arg, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" %d:%d\n", bit >> 5, bit & 31); + else + pr_cont(" %s\n", x86_cap_flags[bit]); + + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + const char *kind; + + if (bit < 32 * NCAPINTS) { + flag = x86_cap_flags[bit]; + kind = "feature"; + } else { + kind = "bug"; + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } + + if (!flag) + continue; + + if (strcmp(flag, opt)) + continue; + + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); + setup_clear_cpu_cap(bit); + } + taint++; + found = true; + break; + } + + if (!found) + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); + } + + return taint; +} + + /* * We parse cpu parameters early because fpu__init_system() is executed * before parse_early_param(). 
*/ static void __init cpu_parse_early_param(void) { + bool cpuid_taint = false; char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; + int arglen; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1519,61 +1603,17 @@ static void __init cpu_parse_early_param(void) setup_clear_cpu_cap(X86_FEATURE_FRED); arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, false); - pr_info("Clearing CPUID bits:"); - - while (argptr) { - bool found __maybe_unused = false; - unsigned int bit; + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, true); - opt = strsep(&argptr, ","); - - /* - * Handle naked numbers first for feature flags which don't - * have names. - */ - if (!kstrtouint(opt, 10, &bit)) { - if (bit < NCAPINTS * 32) { - - /* empty-string, i.e., ""-defined feature flags */ - if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); - else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - - setup_clear_cpu_cap(bit); - taint++; - } - /* - * The assumption is that there are no feature names with only - * numbers in the name thus go to the next argument. - */ - continue; - } - - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) - continue; - - if (strcmp(x86_cap_flag(bit), opt)) - continue; - - pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); - taint++; - found = true; - break; - } - - if (!found) - pr_cont(" (unknown: %s)", opt); - } - pr_cont("\n"); - - if (taint) + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* @@ -1610,6 +1650,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); + check_cpufeature_deps(c); if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); @@ -1870,6 +1911,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Filter out anything that depends on CPUID levels we don't have */ filter_cpuid_features(c, true); + /* Check for unmet dependencies based on the CPUID dependency table */ + check_cpufeature_deps(c); + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { const char *p; @@ -1962,9 +2006,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -1975,6 +2025,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) @@ -2005,27 +2056,40 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming an environment variable for + * init. 
*/ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); -DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { - .current_task = &init_task, - .preempt_count = INIT_PREEMPT_COUNT, - .top_of_stack = TOP_OF_INIT_STACK, -}; -EXPORT_PER_CPU_SYMBOL(pcpu_hot); -EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + +DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +EXPORT_PER_CPU_SYMBOL(const_current_task); + +DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, - fixed_percpu_data) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); +/* + * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING + * so that this space is reserved in the hot cache section even when the + * mitigation is disabled. + */ +DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); +EXPORT_PER_CPU_SYMBOL(__x86_call_depth); static void wrmsrl_cstar(unsigned long val) { @@ -2089,18 +2153,15 @@ void syscall_init(void) if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_syscall_init(); } - -#else /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); #ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -#endif /* CONFIG_X86_64 */ - /* * Clear all 6 debug registers: */ diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1beccefbaff9..51deb60a9d26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index df838e3bdbe0..a2fbea0be535 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -147,3 +147,38 @@ void setup_clear_cpu_cap(unsigned int feature) { do_clear_cpu_cap(NULL, feature); } + +/* + * Return the feature "name" if available, otherwise return + * the X86_FEATURE_* numerals to make it easier to identify + * the feature. + */ +static const char *x86_feature_name(unsigned int feature, char *buf) +{ + if (x86_cap_flags[feature]) + return x86_cap_flags[feature]; + + snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32); + + return buf; +} + +void check_cpufeature_deps(struct cpuinfo_x86 *c) +{ + char feature_buf[16], depends_buf[16]; + const struct cpuid_dep *d; + + for (d = cpuid_deps; d->feature; d++) { + if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) { + /* + * Only warn about the first unmet dependency on the + * first CPU where it is encountered to avoid spamming + * the kernel log. + */ + pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. 
Kernel might be fine, but no guarantees.\n", + smp_processor_id(), + x86_feature_name(d->feature, feature_buf), + x86_feature_name(d->depends, depends_buf)); + } + } +} diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index cacfd3f6abef..1976fef2dfe5 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f2..6af4a4a90a52 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 134368a3f4b1..4cbb2e69bea1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,40 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/pgtable.h> -#include <linux/string.h> #include <linux/bitops.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/thread_info.h> #include <linux/init.h> -#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/minmax.h> +#include <linux/smp.h> +#include <linux/string.h> + +#ifdef CONFIG_X86_64 +#include <linux/topology.h> +#endif -#include <asm/cpufeature.h> -#include <asm/msr.h> #include <asm/bugs.h> +#include <asm/cpu_device_id.h> +#include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> -#include <asm/hwcap2.h> -#include <asm/elf.h> -#include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/numa.h> +#include <asm/resctrl.h> #include <asm/thermal.h> - -#ifdef CONFIG_X86_64 -#include <linux/topology.h> -#endif +#include <asm/uaccess.h> #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#endif - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. 
However, there exists @@ -195,7 +186,7 @@ void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN) return; /* @@ -210,10 +201,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); @@ -256,8 +243,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) #endif /* CPUID workaround for 0F33/0F34 CPU */ - if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) + if (c->x86_vfm == INTEL_P4_PRESCOTT && + (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -266,10 +253,16 @@ static void early_init_intel(struct cpuinfo_x86 *c) * * It is also reliable across cores and sockets. (but not across * cabinets - we turn it off in that case explicitly.) + * + * Use a model-specific check for some older CPUs that have invariant + * TSC but may not report it architecturally via 8000_0007. */ if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) || + (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -298,12 +291,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PAT); /* - * If fast string is not enabled in IA32_MISC_ENABLE for any reason, - * clear the fast string and enhanced fast string CPU capabilities. + * Modern CPUs are generally expected to have a sane fast string + * implementation. However, BIOSes typically have a knob to tweak + * the architectural MISC_ENABLE.FAST_STRING enable bit. + * + * Adhere to the preference and program the Linux-defined fast + * string flag and enhanced fast string capabilities accordingly. */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + /* X86_FEATURE_ERMS is set based on CPUID */ + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } else { pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); @@ -350,9 +350,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c) int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping < 8) { pr_info("Pentium Pro with Errata#50 detected. 
Taking evasive action.\n"); return 1; @@ -369,9 +367,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c) /* * Mask B, Pentium, but not Pentium MMX */ - if (c->x86 == 5 && - c->x86_stepping >= 1 && c->x86_stepping <= 4 && - c->x86_model <= 3) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX && + c->x86_stepping >= 1 && c->x86_stepping <= 4) { /* * Remember we have B step Pentia with bugs */ @@ -398,7 +395,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (c->x86 == 5 && c->x86_model < 9) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -413,7 +410,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) + if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) || + c->x86_vfm < INTEL_PENTIUM_II_KLAMATH) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -431,7 +429,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { + if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); @@ -445,27 +443,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 && (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); - #ifdef CONFIG_X86_INTEL_USERCOPY /* - * Set up the preferred alignment for movsl bulk memory moves + * MOVSL bulk memory moves can be slow when source and dest are not + * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment. + * + * Set the preferred alignment for Pentium Pro and newer processors, as + * it has only been tested on these. */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ + if (c->x86_vfm >= INTEL_PENTIUM_PRO) movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; - } #endif intel_smp_check(c); @@ -563,8 +554,6 @@ static void init_intel(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); #else /* * Names for the Pentium II/Celeron processors @@ -622,14 +611,14 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) * to determine which, so we use a boottime override * for the 512kb model, and assume 256 otherwise. 
*/ - if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0) size = 256; /* * Intel Quark SoC X1000 contains a 4-way set associative * 16K cache with a 16 byte cache line and 256 lines per tag */ - if ((c->x86 == 5) && (c->x86_model == 9)) + if (c->x86_vfm == INTEL_QUARK_X1000) size = 16; return size; } @@ -667,50 +656,58 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) */ #define TLB_0x63_2M_4M_ENTRIES 32 +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; +}; + static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" - " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 
KByte pages, 4-way associative" }, + { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ + { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ + { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ + { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ + { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ + { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ + { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ + { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ + { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ + { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ + { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ { 0x00, 0, 0 } }; static void intel_tlb_lookup(const unsigned char desc) { + unsigned int entries; unsigned char k; + if (desc == 0) return; @@ -722,81 +719,58 @@ static void intel_tlb_lookup(const unsigned char desc) if (intel_tlb_table[k].tlb_type == 0) return; + entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = 
intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_1G_2M_4M: - if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; - if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m = max(tlb_lld_4m, 
TLB_0x63_2M_4M_ENTRIES); fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } @@ -891,34 +865,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. - */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} - -/** - * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU - * - * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. - * If the processor is not hybrid, returns 0. - */ -u32 get_this_hybrid_cpu_native_id(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) & - (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); -} diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 4f3c65429f82..6af1e8baeb0f 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -6,6 +6,34 @@ #include <linux/slab.h> /** + * x86_match_vendor_cpu_type - helper function to match the hardware defined + * cpu-type for a single entry in the x86_cpu_id + * table. Note, this function does not match the + * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and + * TOPO_CPU_TYPE_PERFORMANCE. + * @c: Pointer to the cpuinfo_x86 structure of the CPU to match. + * @m: Pointer to the x86_cpu_id entry to match against. + * + * Return: true if the cpu-type matches, false otherwise. + */ +static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m) +{ + if (m->type == X86_CPU_TYPE_ANY) + return true; + + /* Hybrid CPUs are special, they are assumed to match all cpu-types */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return true; + + if (c->x86_vendor == X86_VENDOR_INTEL) + return m->type == c->topo.intel_type; + if (c->x86_vendor == X86_VENDOR_AMD) + return m->type == c->topo.amd_type; + + return false; +} + +/** * x86_match_cpu - match current CPU against an array of x86_cpu_ids * @match: Pointer to array of x86_cpu_ids. Last entry terminated with * {}. @@ -50,6 +78,8 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; + if (!x86_match_vendor_cpu_type(c, m)) + continue; return m; } return NULL; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 0dc00c9894c7..1f14c3308b6b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -584,6 +584,28 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. 
+ */ +static bool mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return true; + } + + return false; +} + static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -1773,28 +1795,6 @@ static void mce_timer_delete_all(void) del_timer_sync(&per_cpu(mce_timer, cpu)); } -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -bool mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return true; - } - return false; -} -EXPORT_SYMBOL_GPL(mce_notify_irq); - static void __mcheck_cpu_mce_banks_init(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 313fe682db33..06e3cf7229ce 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -229,7 +229,6 @@ static int raise_local(void) } else if (m->status) { pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); - mce_notify_irq(); pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f3d534807d91..819199bc0119 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig) sig->pf = 0; sig->rev = intel_get_microcode_revision(); - if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) { + if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) { unsigned int val[2]; /* get processor flags from MSR 0x17 */ diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 2fdfda2b60e4..e2c6b471d230 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,9 +9,11 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/cc_platform.h> +#include <linux/string_choices.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> @@ -646,10 +648,10 @@ static void __init print_mtrr_state(void) pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_info("MTRR fixed ranges %sabled:\n", - ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? - "en" : "dis"); + pr_info("MTRR fixed ranges %s:\n", + str_enabled_disabled( + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -661,8 +663,8 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_info("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? 
"en" : "dis"); + pr_info("MTRR variable ranges %s:\n", + str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -1025,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ - if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7f..4049235b1bfe 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 41ed01f46bd9..6571d432cbe3 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = arch_freq_get_on_cpu(cpu); + int freq = arch_freq_get_on_cpu(cpu); - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + if (freq < 0) + seq_puts(m, "cpu MHz\t\t: Unknown\n"); + else + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 4a06c37b9cf1..0c13b0befd8a 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o -obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o +obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o +obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 3d1735ed8d1f..cf29681d01e0 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -44,12 +44,6 @@ static DEFINE_MUTEX(domain_list_lock); DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state); /* - * Used to store the max resource name width and max resource data width - * to display the schemata in a tabular format - */ -int max_name_width, max_data_width; - -/* * Global boolean for rdt_alloc which is true if any * resource allocation is enabled. 
*/ @@ -62,7 +56,7 @@ static void mba_wrmsr_amd(struct msr_param *m); #define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains) #define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains) -struct rdt_hw_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { @@ -72,9 +66,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .mon_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3), .mon_domains = mon_domain_init(RDT_RESOURCE_L3), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, @@ -86,9 +78,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .name = "L2", .ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, @@ -100,9 +90,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = @@ -112,9 +100,7 @@ struct rdt_hw_resource rdt_resources_all[] = { .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, }; @@ -127,6 +113,14 @@ u32 resctrl_arch_system_num_rmid_idx(void) return r->num_rmid; } +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &rdt_resources_all[l].r_resctrl; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -161,7 +155,6 @@ static inline void cache_alloc_hsw_probe(void) return; hw_res->num_closid = 4; - r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; @@ -174,7 +167,7 @@ static inline void cache_alloc_hsw_probe(void) bool is_mba_sc(struct rdt_resource *r) { if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); /* * The software controller support is only applicable to MBA resource. 
@@ -217,7 +210,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; - r->default_ctrl = MAX_MBA_BW; + r->membw.max_bw = MAX_MBA_BW; r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; @@ -228,16 +221,12 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) return false; r->membw.arch_needs_linear = false; } - r->data_width = 3; if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; else r->membw.throttle_mode = THREAD_THROTTLE_MAX; - resctrl_file_fflags_init("thread_throttle_mode", - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); - r->alloc_capable = true; return true; @@ -256,7 +245,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; - r->default_ctrl = 1 << eax; + r->membw.max_bw = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -269,8 +258,6 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; - /* Max value is 2048, Data width should be 4 in decimal */ - r->data_width = 4; r->alloc_capable = true; @@ -283,14 +270,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) union cpuid_0x10_1_eax eax; union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx; + u32 ebx, default_ctrl; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; - r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; - r->cache.shareable_bits = ebx & r->default_ctrl; - r->data_width = (r->cache.cbm_len + 3) / 4; + default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & default_ctrl; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; @@ -337,7 +323,7 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); - return r->default_ctrl; + return MAX_MBA_BW; } static void mba_wrmsr_intel(struct msr_param *m) @@ -361,36 +347,6 @@ static void cat_wrmsr(struct msr_param *m) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } -struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - u32 resctrl_arch_get_num_closid(struct rdt_resource *r) { return resctrl_to_arch_res(r)->num_closid; @@ -405,36 +361,6 @@ void rdt_ctrl_update(void *arg) hw_res->msr_update(m); } -/* - * rdt_find_domain - Search for a domain id in a resource domain list. - * - * Search the domain list to find the domain id. 
If the domain id is - * found, return the domain. NULL otherwise. If the domain id is not - * found (and NULL returned) then the first domain with id bigger than - * the input id can be returned to the caller via @pos. - */ -struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -446,7 +372,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = r->default_ctrl; + *dc = resctrl_get_default_ctrl(r); } static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) @@ -494,13 +420,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (is_mbm_total_enabled()) { + if (resctrl_arch_is_mbm_total_enabled()) { tsize = sizeof(*hw_dom->arch_mbm_total); hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (is_mbm_local_enabled()) { + if (resctrl_arch_is_mbm_local_enabled()) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { @@ -545,7 +471,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos); + hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) return; @@ -600,7 +526,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->mon_domains, id, &add_pos); + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) return; @@ -665,7 +591,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->ctrl_domains, id, NULL); + hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL); if (!hdr) { pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n", id, cpu, r->name); @@ -711,7 +637,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->mon_domains, id, NULL); + hdr = resctrl_find_domain(&r->mon_domains, id, NULL); if (!hdr) { pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n", id, cpu, r->name); @@ -786,20 +712,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } -/* - * Choose a width for the resource name and resource data based on the - * resource that has widest name and cbm. 
- */ -static __init void rdt_init_padding(void) -{ - struct rdt_resource *r; - - for_each_alloc_capable_rdt_resource(r) { - if (r->data_width > max_data_width) - max_data_width = r->data_width; - } -} - enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, @@ -885,6 +797,21 @@ bool __init rdt_cpu_has(int flag) return ret; } +__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + if (!rdt_cpu_has(X86_FEATURE_BMEC)) + return false; + + switch (evt) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL); + case QOS_L3_MBM_LOCAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL); + default: + return false; + } +} + static __init bool get_mem_config(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; @@ -963,11 +890,6 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; - if (is_mbm_local_enabled()) - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (is_mbm_total_enabled()) - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; - return !rdt_get_mon_l3_config(r); } @@ -1086,7 +1008,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) } } -static int __init resctrl_late_init(void) +static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; int state, ret; @@ -1102,8 +1024,6 @@ static int __init resctrl_late_init(void) if (!get_rdt_resources()) return -ENODEV; - rdt_init_padding(); - state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", resctrl_arch_online_cpu, @@ -1111,7 +1031,7 @@ static int __init resctrl_late_init(void) if (state < 0) return state; - ret = rdtgroup_init(); + ret = resctrl_init(); if (ret) { cpuhp_remove_state(state); return ret; @@ -1127,18 +1047,13 @@ static int __init resctrl_late_init(void) return 0; } -late_initcall(resctrl_late_init); +late_initcall(resctrl_arch_late_init); -static void __exit resctrl_exit(void) +static void __exit resctrl_arch_exit(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - cpuhp_remove_state(rdt_online); - rdtgroup_exit(); - - if (r->mon_capable) - rdt_put_mon_l3_config(); + resctrl_exit(); } -__exitcall(resctrl_exit); +__exitcall(resctrl_arch_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 536351159cc2..0a0ac5f6112e 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -23,6 +23,15 @@ #include "internal.h" +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, + struct resctrl_schema *s, + struct rdt_ctrl_domain *d); + /* * Check whether MBA bandwidth percentage value is correct. 
The value is * checked against the minimum and max bandwidth values specified by the @@ -54,9 +63,9 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } - if (bw < r->membw.min_bw || bw > r->default_ctrl) { + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->default_ctrl); + bw, r->membw.min_bw, r->membw.max_bw); return false; } @@ -64,8 +73,8 @@ static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) +static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; u32 closid = data->rdtgrp->closid; @@ -104,8 +113,9 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, */ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { - unsigned long first_bit, zero_bit, val; + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit, val; int ret; ret = kstrtoul(buf, 16, &val); @@ -114,7 +124,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return false; } - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -143,8 +153,8 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) +static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; struct resctrl_staged_config *cfg; @@ -210,6 +220,7 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { enum resctrl_conf_type t = s->conf_type; + ctrlval_parser_t *parse_ctrlval = NULL; struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; struct rdt_parse_data data; @@ -220,6 +231,18 @@ static int parse_line(char *line, struct resctrl_schema *s, /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + parse_ctrlval = &parse_cbm; + break; + case RESCTRL_SCHEMA_RANGE: + parse_ctrlval = &parse_bw; + break; + } + + if (WARN_ON_ONCE(!parse_ctrlval)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -240,7 +263,7 @@ next: if (d->hdr.id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, s, d)) + if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { cfg = &d->staged_config[t]; @@ -264,25 +287,12 @@ next: return -EINVAL; } -static u32 get_config_index(u32 closid, enum resctrl_conf_type type) -{ - switch (type) { - default: - case CDP_NONE: - return closid; - case CDP_CODE: - return closid * 2 + 1; - case CDP_DATA: - return closid * 2; - } -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { struct rdt_hw_ctrl_domain *hw_dom = 
resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - u32 idx = get_config_index(closid, t); + u32 idx = resctrl_get_config_index(closid, t); struct msr_param msr_param; if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) @@ -319,7 +329,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!cfg->have_new_ctrl) continue; - idx = get_config_index(closid, t); + idx = resctrl_get_config_index(closid, t); if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; hw_dom->ctrl_val[idx] = cfg->new_ctrl; @@ -439,7 +449,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); - u32 idx = get_config_index(closid, type); + u32 idx = resctrl_get_config_index(closid, type); return hw_dom->ctrl_val[idx]; } @@ -465,8 +475,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo ctrl_val = resctrl_arch_get_config(r, dom, closid, schema->conf_type); - seq_printf(s, r->format_str, dom->hdr.id, max_data_width, - ctrl_val); + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); sep = true; } seq_puts(s, "\n"); @@ -537,12 +546,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (is_mbm_local_enabled()) + if (resctrl_arch_is_mbm_local_enabled()) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (is_mbm_total_enabled()) + if (resctrl_arch_is_mbm_total_enabled()) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; @@ -588,6 +597,28 @@ int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, return ret; } +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos) +{ + struct rdt_domain_hdr *d; + struct list_head *l; + + list_for_each(l, h) { + d = list_entry(l, struct rdt_domain_hdr, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first) @@ -649,7 +680,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md.u.rid; domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid].r_resctrl; + r = resctrl_arch_get_resource(resid); if (md.u.sum) { /* @@ -673,7 +704,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * This file provides data from a single domain. Search * the resource to find the domain with "domid". 
*/ - hdr = rdt_find_domain(&r->mon_domains, domid, NULL); + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { ret = -ENOENT; goto out; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 20c898f09b7e..c44c5b496355 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -32,30 +32,6 @@ */ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) -/* Reads to Local DRAM Memory */ -#define READS_TO_LOCAL_MEM BIT(0) - -/* Reads to Remote DRAM Memory */ -#define READS_TO_REMOTE_MEM BIT(1) - -/* Non-Temporal Writes to Local Memory */ -#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) - -/* Non-Temporal Writes to Remote Memory */ -#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) - -/* Reads to Local Memory the system identifies as "Slow Memory" */ -#define READS_TO_LOCAL_S_MEM BIT(4) - -/* Reads to Remote Memory the system identifies as "Slow Memory" */ -#define READS_TO_REMOTE_S_MEM BIT(5) - -/* Dirty Victims to All Types of Memory */ -#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) - -/* Max event bits supported */ -#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) - /** * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that * aren't marked nohz_full @@ -180,7 +156,6 @@ struct rmid_read { void *arch_mon_ctx; }; -extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; extern bool resctrl_mounted; @@ -234,43 +209,6 @@ struct mongroup { }; /** - * struct pseudo_lock_region - pseudo-lock region information - * @s: Resctrl schema for the resource to which this - * pseudo-locked region belongs - * @d: RDT domain to which this pseudo-locked region - * belongs - * @cbm: bitmask of the pseudo-locked region - * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread - * completion - * @thread_done: variable used by waitqueue to test if pseudo-locking - * thread completed - * @cpu: core associated with the cache on which the setup code - * will be run - * @line_size: size of the cache lines - * @size: size of pseudo-locked region in bytes - * @kmem: the kernel memory associated with pseudo-locked region - * @minor: minor number of character device associated with this - * region - * @debugfs_dir: pointer to this region's directory in the debugfs - * filesystem - * @pm_reqs: Power management QoS requests related to this region - */ -struct pseudo_lock_region { - struct resctrl_schema *s; - struct rdt_ctrl_domain *d; - u32 cbm; - wait_queue_head_t lock_thread_wq; - int thread_done; - int cpu; - unsigned int line_size; - unsigned int size; - void *kmem; - unsigned int minor; - struct dentry *debugfs_dir; - struct list_head pm_reqs; -}; - -/** * struct rdtgroup - store rdtgroup's data in resctrl file system. 
* @kn: kernfs node * @rdtgroup_list: linked list for all rdtgroups @@ -326,10 +264,7 @@ struct rdtgroup { /* List of all resource groups */ extern struct list_head rdt_all_groups; -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); -void __exit rdtgroup_exit(void); +extern int max_name_width; /** * struct rftype - describe each file in the resctrl file system @@ -433,37 +368,6 @@ struct msr_param { u32 high; }; -static inline bool is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - -static inline bool is_mbm_enabled(void) -{ - return (is_mbm_total_enabled() || is_mbm_local_enabled()); -} - -static inline bool is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /** * struct rdt_hw_resource - arch private attributes of a resctrl resource * @r_resctrl: Attributes of the resource used directly by resctrl. @@ -476,8 +380,6 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth - * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -491,7 +393,6 @@ struct rdt_hw_resource { void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; - unsigned int mbm_cfg_mask; bool cdp_enabled; }; @@ -500,11 +401,6 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d); -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; @@ -512,24 +408,6 @@ extern struct rdtgroup rdtgroup_default; extern struct dentry *debugfs_resctrl; extern enum resctrl_event_id mba_mbps_default_event; -enum resctrl_res_level { - RDT_RESOURCE_L3, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - RDT_RESOURCE_SMBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); - - hw_res++; - return &hw_res->r_resctrl; -} - static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { return rdt_resources_all[l].cdp_enabled; @@ -539,27 +417,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); -/* - * To return the common struct rdt_resource, which is contained in struct - * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. 
- */ -#define for_each_rdt_resource(r) \ - for (r = &rdt_resources_all[0].r_resctrl; \ - r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ - r = resctrl_inc(r)) - -#define for_each_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable || r->mon_capable) - -#define for_each_alloc_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable) - -#define for_each_mon_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_capable) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -604,8 +461,6 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn); int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, umode_t mask); -struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, - struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, @@ -620,28 +475,19 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain unsigned long cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); -struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); -void __exit rdt_put_mon_l3_config(void); +void resctrl_mon_resource_exit(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, cpumask_t *cpumask, int evtid, int first); +int __init resctrl_mon_resource_init(void); void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); @@ -658,4 +504,45 @@ void resctrl_file_fflags_init(const char *config, unsigned long fflags); void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); int resctrl_find_cleanest_closid(void); + +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); +int rdt_pseudo_lock_init(void); +void rdt_pseudo_lock_release(void); +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); +#else +static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline bool 
rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) +{ + return false; +} + +static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) +{ + return false; +} + +static inline int rdt_pseudo_lock_init(void) { return 0; } +static inline void rdt_pseudo_lock_release(void) { } +static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 94a1d9780461..a93ed7d2a160 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -295,11 +295,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (is_mbm_total_enabled()) + if (resctrl_arch_is_mbm_total_enabled()) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (is_mbm_local_enabled()) + if (resctrl_arch_is_mbm_local_enabled()) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } @@ -365,7 +365,7 @@ static void limbo_release_entry(struct rmid_entry *entry) */ void __check_limbo(struct rdt_mon_domain *d, bool force_free) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; u32 idx, cur_idx = 1; @@ -521,7 +521,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdt_mon_domain *d; u32 idx; @@ -569,7 +569,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (is_llc_occupancy_enabled()) + if (resctrl_arch_is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -718,6 +718,22 @@ void mon_event_count(void *info) rr->err = 0; } +static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + /* * Feedback loop for MBA software controller (mba_sc) * @@ -761,7 +777,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) struct rdtgroup *entry; u32 cur_bw, user_bw; - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); evt_id = rgrp->mba_mbps_event; closid = rgrp->closid; @@ -852,10 +868,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. 
*/ - if (is_mbm_total_enabled()) + if (resctrl_arch_is_mbm_total_enabled()) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - if (is_mbm_local_enabled()) + if (resctrl_arch_is_mbm_local_enabled()) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } @@ -925,7 +941,7 @@ void mbm_handle_overflow(struct work_struct *work) if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { @@ -1027,7 +1043,7 @@ static int dom_data_init(struct rdt_resource *r) /* * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in rdtgroup_init(). + * control group, which will be setup later in resctrl_init(). */ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); @@ -1040,10 +1056,13 @@ out_unlock: return err; } -static void __exit dom_data_exit(void) +static void dom_data_exit(struct rdt_resource *r) { mutex_lock(&rdtgroup_mutex); + if (!r->mon_capable) + goto out_unlock; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { kfree(closid_num_dirty_rmid); closid_num_dirty_rmid = NULL; @@ -1052,6 +1071,7 @@ static void __exit dom_data_exit(void) kfree(rmid_ptrs); rmid_ptrs = NULL; +out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -1081,11 +1101,11 @@ static void l3_mon_evt_init(struct rdt_resource *r) { INIT_LIST_HEAD(&r->evt_list); - if (is_llc_occupancy_enabled()) + if (resctrl_arch_is_llc_occupancy_enabled()) list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (is_mbm_total_enabled()) + if (resctrl_arch_is_mbm_total_enabled()) list_add_tail(&mbm_total_event.list, &r->evt_list); - if (is_mbm_local_enabled()) + if (resctrl_arch_is_mbm_local_enabled()) list_add_tail(&mbm_local_event.list, &r->evt_list); } @@ -1172,12 +1192,56 @@ static __init int snc_get_config(void) return ret; } +/** + * resctrl_mon_resource_init() - Initialise global monitoring structures. + * + * Allocate and initialise global monitor resources that do not belong to a + * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * Called once during boot after the struct rdt_resource's have been configured + * but before the filesystem is mounted. + * Resctrl's cpuhp callbacks may be called before this point to bring a domain + * online. + * + * Returns 0 for success, or -ENOMEM. 
+ */ +int __init resctrl_mon_resource_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { + mbm_total_event.configurable = true; + resctrl_file_fflags_init("mbm_total_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + } + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { + mbm_local_event.configurable = true; + resctrl_file_fflags_init("mbm_local_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + } + + if (resctrl_arch_is_mbm_local_enabled()) + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else if (resctrl_arch_is_mbm_total_enabled()) + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + + return 0; +} + int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; - int ret; snc_nodes_per_l3_cache = snc_get_config(); @@ -1207,39 +1271,24 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - ret = dom_data_init(r); - if (ret) - return ret; - if (rdt_cpu_has(X86_FEATURE_BMEC)) { u32 eax, ebx, ecx, edx; /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; - - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - mbm_total_event.configurable = true; - resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - mbm_local_event.configurable = true; - resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } + r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } - l3_mon_evt_init(r); - r->mon_capable = true; return 0; } -void __exit rdt_put_mon_l3_config(void) +void resctrl_mon_resource_exit(void) { - dom_data_exit(); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + dom_data_exit(r); } void __init intel_rdt_mbm_apply_quirk(void) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 42cc162f7fc9..01fa7890b43f 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -61,7 +61,8 @@ static const struct class pseudo_lock_class = { }; /** - * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported + * platforms * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support @@ -75,14 +76,16 @@ static const struct class pseudo_lock_class = { * in the SDM. * * When adding a platform here also add support for its cache events to - * measure_cycles_perf_fn() + * resctrl_arch_measure_l*_residency() * * Return: * If platform is supported, the bits to disable hardware prefetchers, 0 * if platform is not supported. 
*/ -static u64 get_prefetch_disable_bits(void) +u64 resctrl_arch_get_prefetch_disable_bits(void) { + prefetch_disable_bits = 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) return 0; @@ -98,7 +101,8 @@ static u64 get_prefetch_disable_bits(void) * 3 DCU IP Prefetcher Disable (R/W) * 63:4 Reserved */ - return 0xF; + prefetch_disable_bits = 0xF; + break; case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_PLUS: /* @@ -109,10 +113,11 @@ static u64 get_prefetch_disable_bits(void) * 2 DCU Hardware Prefetcher Disable (R/W) * 63:3 Reserved */ - return 0x5; + prefetch_disable_bits = 0x5; + break; } - return 0; + return prefetch_disable_bits; } /** @@ -408,8 +413,8 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp) } /** - * pseudo_lock_fn - Load kernel memory into cache - * @_rdtgrp: resource group to which pseudo-lock region belongs + * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache + * @_plr: the pseudo-lock region descriptor * * This is the core pseudo-locking flow. * @@ -426,10 +431,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp) * * Return: 0. Waiter on waitqueue will be woken on completion. */ -static int pseudo_lock_fn(void *_rdtgrp) +int resctrl_arch_pseudo_lock_fn(void *_plr) { - struct rdtgroup *rdtgrp = _rdtgrp; - struct pseudo_lock_region *plr = rdtgrp->plr; + struct pseudo_lock_region *plr = _plr; u32 rmid_p, closid_p; unsigned long i; u64 saved_msr; @@ -489,7 +493,8 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -712,8 +717,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * Not knowing the bits to disable prefetching implies that this * platform does not support Cache Pseudo-Locking. */ - prefetch_disable_bits = get_prefetch_disable_bits(); - if (prefetch_disable_bits == 0) { + if (resctrl_arch_get_prefetch_disable_bits() == 0) { rdt_last_cmd_puts("Pseudo-locking not supported\n"); return -EINVAL; } @@ -872,7 +876,8 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) } /** - * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read + * pseudo-locked memory * @_plr: pseudo-lock region to measure * * There is no deterministic way to test if a memory region is cached. One @@ -885,7 +890,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) * * Return: 0. Waiter on waitqueue will be woken on completion. 
*/ -static int measure_cycles_lat_fn(void *_plr) +int resctrl_arch_measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; u32 saved_low, saved_high; @@ -1069,7 +1074,7 @@ out: return 0; } -static int measure_l2_residency(void *_plr) +int resctrl_arch_measure_l2_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1107,7 +1112,7 @@ out: return 0; } -static int measure_l3_residency(void *_plr) +int resctrl_arch_measure_l3_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1205,14 +1210,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) plr->cpu = cpu; if (sel == 1) - thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr, - cpu, "pseudo_lock_measure/%u"); + thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, + plr, cpu, "pseudo_lock_measure/%u"); else if (sel == 2) - thread = kthread_run_on_cpu(measure_l2_residency, plr, - cpu, "pseudo_lock_measure/%u"); + thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, + plr, cpu, "pseudo_lock_measure/%u"); else if (sel == 3) - thread = kthread_run_on_cpu(measure_l3_residency, plr, - cpu, "pseudo_lock_measure/%u"); + thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, + plr, cpu, "pseudo_lock_measure/%u"); else goto out; @@ -1307,7 +1312,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) plr->thread_done = 0; - thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp, + thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, plr->cpu, "pseudo_lock/%u"); if (IS_ERR(thread)) { ret = PTR_ERR(thread); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6419e04d8a7b..c6274d40b217 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -57,6 +57,12 @@ static struct kernfs_node *kn_mongrp; /* Kernel fs node for "mon_data" directory under root */ static struct kernfs_node *kn_mondata; +/* + * Used to store the max resource name width to display the schemata names in + * a tabular format. + */ +int max_name_width; + static struct seq_buf last_cmd_status; static char last_cmd_status_buf[512]; @@ -111,6 +117,18 @@ void rdt_staged_configs_clear(void) } } +static bool resctrl_is_mbm_enabled(void) +{ + return (resctrl_arch_is_mbm_total_enabled() || + resctrl_arch_is_mbm_local_enabled()); +} + +static bool resctrl_is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -157,7 +175,8 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && + resctrl_arch_is_llc_occupancy_enabled()) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -348,13 +367,13 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. 
*/ -static void update_cpu_closid_rmid(void *info) +void resctrl_arch_sync_cpu_closid_rmid(void *info) { - struct rdtgroup *r = info; + struct resctrl_cpu_defaults *r = info; if (r) { this_cpu_write(pqr_state.default_closid, r->closid); - this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + this_cpu_write(pqr_state.default_rmid, r->rmid); } /* @@ -369,11 +388,20 @@ static void update_cpu_closid_rmid(void *info) * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * * Per task closids/rmids must have been set up before calling this function. + * @r may be NULL. */ static void update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { - on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); + struct resctrl_cpu_defaults defaults, *p = NULL; + + if (r) { + defaults.closid = r->closid; + defaults.rmid = r->mon.rmid; + p = &defaults; + } + + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); } static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, @@ -971,7 +999,7 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct resctrl_schema *s = of->kn->parent->priv; struct rdt_resource *r = s->res; - seq_printf(seq, "%x\n", r->default_ctrl); + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); return 0; } @@ -1160,10 +1188,19 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, struct resctrl_schema *s = of->kn->parent->priv; struct rdt_resource *r = s->res; - if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + switch (r->membw.throttle_mode) { + case THREAD_THROTTLE_PER_THREAD: seq_puts(seq, "per-thread\n"); - else + return 0; + case THREAD_THROTTLE_MAX: seq_puts(seq, "max\n"); + return 0; + case THREAD_THROTTLE_UNDEFINED: + seq_puts(seq, "undefined\n"); + return 0; + } + + WARN_ON_ONCE(1); return 0; } @@ -1425,7 +1462,8 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, goto out; } rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (!strcmp(buf, "pseudo-locksetup")) { + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && + !strcmp(buf, "pseudo-locksetup")) { ret = rdtgroup_locksetup_enter(rdtgrp); if (ret) goto out; @@ -1552,11 +1590,6 @@ out: return ret; } -struct mon_config_info { - u32 evtid; - u32 mon_config; -}; - #define INVALID_CONFIG_INDEX UINT_MAX /** @@ -1581,31 +1614,32 @@ static inline unsigned int mon_event_config_index_get(u32 evtid) } } -static void mon_event_config_read(void *info) +void resctrl_arch_mon_event_config_read(void *_config_info) { - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; u64 msrval; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ - mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; + config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info) +static void mondata_config_read(struct resctrl_mon_config_info *mon_info) { - smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1); + smp_call_function_any(&mon_info->d->hdr.cpu_mask, + resctrl_arch_mon_event_config_read, mon_info, 1); } static int mbm_config_show(struct seq_file *s, struct 
rdt_resource *r, u32 evtid) { - struct mon_config_info mon_info; + struct resctrl_mon_config_info mon_info; struct rdt_mon_domain *dom; bool sep = false; @@ -1616,9 +1650,11 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid if (sep) seq_puts(s, ";"); - memset(&mon_info, 0, sizeof(struct mon_config_info)); + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); + mon_info.r = r; + mon_info.d = dom; mon_info.evtid = evtid; - mondata_config_read(dom, &mon_info); + mondata_config_read(&mon_info); seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); sep = true; @@ -1651,30 +1687,32 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, return 0; } -static void mon_event_config_write(void *info) +void resctrl_arch_mon_event_config_write(void *_config_info) { - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); + wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0); } static void mbm_config_write_domain(struct rdt_resource *r, struct rdt_mon_domain *d, u32 evtid, u32 val) { - struct mon_config_info mon_info = {0}; + struct resctrl_mon_config_info mon_info = {0}; /* * Read the current config value first. If both are the same then * no need to write it again. */ + mon_info.r = r; + mon_info.d = d; mon_info.evtid = evtid; - mondata_config_read(d, &mon_info); + mondata_config_read(&mon_info); if (mon_info.mon_config == val) return; @@ -1686,7 +1724,7 @@ static void mbm_config_write_domain(struct rdt_resource *r, * are scoped at the domain level. Writing any of these MSRs * on one CPU is observed by all the CPUs in the domain. 
*/ - smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write, + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, &mon_info, 1); /* @@ -1703,7 +1741,6 @@ static void mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_mon_domain *d; @@ -1730,9 +1767,9 @@ next: } /* Value from user cannot be more than the supported set of events */ - if ((val & hw_res->mbm_cfg_mask) != val) { + if ((val & r->mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - hw_res->mbm_cfg_mask); + r->mbm_cfg_mask); return -EINVAL; } @@ -2036,6 +2073,28 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name) return NULL; } +static void thread_throttle_mode_init(void) +{ + enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; + struct rdt_resource *r_mba, *r_smba; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r_mba->alloc_capable && + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->membw.throttle_mode; + + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); + if (r_smba->alloc_capable && + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->membw.throttle_mode; + + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) + return; + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -2164,6 +2223,20 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; } +static unsigned long fflags_from_resource(struct rdt_resource *r) +{ + switch (r->rid) { + case RDT_RESOURCE_L3: + case RDT_RESOURCE_L2: + return RFTYPE_RES_CACHE; + case RDT_RESOURCE_MBA: + case RDT_RESOURCE_SMBA: + return RFTYPE_RES_MB; + } + + return WARN_ON_ONCE(1); +} + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { struct resctrl_schema *s; @@ -2184,14 +2257,14 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - fflags = r->fflags | RFTYPE_CTRL_INFO; + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RFTYPE_MON_INFO; + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) @@ -2255,7 +2328,7 @@ static void l2_qos_cfg_update(void *arg) static inline bool is_mba_linear(void) { - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; } static int set_cache_qos_cfg(int level, bool enable) @@ -2345,10 +2418,10 @@ static void mba_sc_domain_destroy(struct rdt_resource *r, */ static bool supports_mba_mbps(void) { - struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - return (is_mbm_enabled() && + 
return (resctrl_is_mbm_enabled() && r->alloc_capable && is_mba_linear() && r->ctrl_scope == rmbm->mon_scope); } @@ -2359,7 +2432,7 @@ static bool supports_mba_mbps(void) */ static int set_mba_sc(bool mba_sc) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_ctrl_domain *d; unsigned long fflags; @@ -2596,6 +2669,20 @@ static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type if (cl > max_name_width) max_name_width = cl; + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + s->fmt_str = "%d=%x"; + break; + case RESCTRL_SCHEMA_RANGE: + s->fmt_str = "%d=%u"; + break; + } + + if (WARN_ON_ONCE(!s->fmt_str)) { + kfree(s); + return -EINVAL; + } + INIT_LIST_HEAD(&s->list); list_add(&s->list, &resctrl_schema_all); @@ -2712,8 +2799,8 @@ static int rdt_get_tree(struct fs_context *fc) if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) resctrl_mounted = true; - if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + if (resctrl_is_mbm_enabled()) { + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); list_for_each_entry(dom, &r->mon_domains, hdr.list) mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, RESCTRL_PICK_ANY_CPU); @@ -2823,7 +2910,7 @@ static int rdt_init_fs_context(struct fs_context *fc) return 0; } -static int reset_all_ctrls(struct rdt_resource *r) +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_ctrl_domain *hw_dom; @@ -2847,12 +2934,12 @@ static int reset_all_ctrls(struct rdt_resource *r) hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = r->default_ctrl; + hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); msr_param.dom = d; smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - return 0; + return; } /* @@ -2971,9 +3058,10 @@ static void rdt_kill_sb(struct super_block *sb) rdt_disable_ctx(); - /*Put everything back to default values. */ + /* Put everything back to default values. */ for_each_alloc_capable_rdt_resource(r) - reset_all_ctrls(r); + resctrl_arch_reset_all_ctrls(r); + rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; @@ -3080,7 +3168,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, if (ret) return ret; - if (!do_sum && is_mbm_event(mevt->evtid)) + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); } @@ -3382,7 +3470,7 @@ static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) } cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = r->default_ctrl; + cfg->new_ctrl = resctrl_get_default_ctrl(r); cfg->have_new_ctrl = true; } } @@ -3696,14 +3784,21 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + u32 closid, rmid; int cpu; /* Give any tasks back to the parent group */ rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - /* Update per cpu rmid of the moved CPUs first */ + /* + * Update per cpu closid/rmid of the moved CPUs first. + * Note: the closid will not change, but the arch code still needs it. 
+ */ + closid = prdtgrp->closid; + rmid = prdtgrp->mon.rmid; for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. @@ -3736,6 +3831,7 @@ static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { + u32 closid, rmid; int cpu; /* Give any tasks back to the default group */ @@ -3746,10 +3842,10 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); /* Update per cpu closid and rmid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) { - per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; - per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; - } + closid = rdtgroup_default.closid; + rmid = rdtgroup_default.mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); /* * Update the MSR on moved CPUs and CPUs which have moved @@ -3950,7 +4046,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) seq_puts(seq, ",cdpl2"); - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) seq_puts(seq, ",mba_MBps"); if (resctrl_debug) @@ -4029,9 +4125,9 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, d); - if (is_mbm_enabled()) + if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. There is no way to know @@ -4049,17 +4145,30 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d mutex_unlock(&rdtgroup_mutex); } +/** + * domain_setup_mon_state() - Initialise domain monitoring structures. + * @r: The resource for the newly online domain. + * @d: The newly online domain. + * + * Allocate monitor resources that belong to this domain. + * Called when the first CPU of a domain comes online, regardless of whether + * the filesystem is mounted. + * During boot this may be called before global allocations have been made by + * resctrl_mon_resource_init(). + * + * Returns 0 for success, or -ENOMEM. 
+ */ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; - if (is_llc_occupancy_enabled()) { + if (resctrl_arch_is_llc_occupancy_enabled()) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (is_mbm_total_enabled()) { + if (resctrl_arch_is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { @@ -4067,7 +4176,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain return -ENOMEM; } } - if (is_mbm_local_enabled()) { + if (resctrl_arch_is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { @@ -4106,13 +4215,13 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) if (err) goto out_unlock; - if (is_mbm_enabled()) { + if (resctrl_is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, RESCTRL_PICK_ANY_CPU); } - if (is_llc_occupancy_enabled()) + if (resctrl_arch_is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4148,9 +4257,25 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) } } +static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + void resctrl_offline_cpu(unsigned int cpu) { - struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; @@ -4167,12 +4292,12 @@ void resctrl_offline_cpu(unsigned int cpu) d = get_mon_domain_from_cpu(cpu, l3); if (d) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(d)) { + if (resctrl_arch_is_llc_occupancy_enabled() && + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); } @@ -4183,14 +4308,14 @@ out_unlock: } /* - * rdtgroup_init - rdtgroup initialization + * resctrl_init - resctrl filesystem initialization * * Setup resctrl file system including set up root, create mount point, - * register rdtgroup filesystem, and initialize files under root directory. + * register resctrl filesystem, and initialize files under root directory. 
* * Return: 0 on success or -errno */ -int __init rdtgroup_init(void) +int __init resctrl_init(void) { int ret = 0; @@ -4199,10 +4324,18 @@ int __init rdtgroup_init(void) rdtgroup_setup_default(); - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + thread_throttle_mode_init(); + + ret = resctrl_mon_resource_init(); if (ret) return ret; + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) { + resctrl_mon_resource_exit(); + return ret; + } + ret = register_filesystem(&rdt_fs_type); if (ret) goto cleanup_mountpoint; @@ -4234,13 +4367,16 @@ int __init rdtgroup_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); + resctrl_mon_resource_exit(); return ret; } -void __exit rdtgroup_exit(void) +void __exit resctrl_exit(void) { debugfs_remove_recursive(debugfs_resctrl); unregister_filesystem(&rdt_fs_type); sysfs_remove_mount_point(fs_kobj, "resctrl"); + + resctrl_mon_resource_exit(); } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 340af8155658..0be61c45400c 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -140,7 +140,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) x86_platform.guest.enc_kexec_begin(); x86_platform.guest.enc_kexec_finish(); - crash_save_cpu(regs, safe_smp_processor_id()); + crash_save_cpu(regs, smp_processor_id()); } #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index a7d562697e50..91639d1e4ec2 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -395,18 +395,13 @@ NOKPROBE_SYMBOL(oops_end); static void __die_header(const char *str, struct pt_regs *regs, long err) { - const char *pr = ""; - /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; - if (IS_ENABLED(CONFIG_PREEMPTION)) - pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; - printk(KERN_DEFAULT - "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, - ++die_counter, pr, + "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, + ++die_counter, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? 
" KASAN" : "", diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b4905d5173fd..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f05339fee778..6c5defd6569a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 82b96ed9890a..57120f0749cc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -28,18 +28,13 @@ * the first 128 E820 memory entries in boot_params.e820_table and the remaining * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * - * - inform the user about the firmware's notion of memory layout - * via /sys/firmware/memmap - * * - the hibernation code uses it to generate a kernel-independent CRC32 * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between - * e820_table_firmware[] and this one is that, the latter marks the setup_data - * list created by the EFI boot stub as reserved, so that kexec can reuse the - * setup_data information in the second kernel. Besides, e820_table_kexec[] - * might also be modified by the kexec itself to fake a mptable. + * e820_table_firmware[] and this one is that e820_table_kexec[] + * might be modified by the kexec itself to fake an mptable. * We use this to: * * - kexec, which is a bootloader in disguise, uses the original E820 @@ -47,6 +42,11 @@ * can have a restricted E820 map while the kexec()-ed kexec-kernel * can have access to full memory - etc. * + * Export the memory layout via /sys/firmware/memmap. kexec-tools uses + * the entries to create an E820 table for the kexec kernel. + * + * kexec_file_load in-kernel code uses the table for the kexec kernel. + * * - 'e820_table': this is the main E820 table that is massaged by the * low level x86 platform code, or modified by boot parameters, before * passed on to higher level MM layers. 
@@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: /* Fall through: */ - case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RAM: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; @@ -764,7 +763,7 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) pfn = PFN_DOWN(entry->addr + entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) register_nosave_region(PFN_UP(entry->addr), pfn); if (pfn >= limit_pfn) @@ -991,60 +990,6 @@ static int __init parse_memmap_opt(char *str) early_param("memmap", parse_memmap_opt); /* - * Reserve all entries from the bootloader's extensible data nodes list, - * because if present we are going to use it later on to fetch e820 - * entries from it: - */ -void __init e820__reserve_setup_data(void) -{ - struct setup_indirect *indirect; - struct setup_data *data; - u64 pa_data, pa_next; - u32 len; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - if (!data) { - pr_warn("e820: failed to memremap setup_data entry\n"); - return; - } - - len = sizeof(*data); - pa_next = data->next; - - e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - - if (data->type == SETUP_INDIRECT) { - len += data->len; - early_memunmap(data, sizeof(*data)); - data = early_memremap(pa_data, len); - if (!data) { - pr_warn("e820: failed to memremap indirect setup_data\n"); - return; - } - - indirect = (struct setup_indirect *)data->data; - - if (indirect->type != SETUP_INDIRECT) - e820__range_update(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } - - pa_data = pa_next; - early_memunmap(data, len); - } - - e820__update_table(e820_table); - - pr_info("extended physical RAM map:\n"); - e820__print_table("reserve setup_data"); -} - -/* * Called after parse_early_param(), after early parameters (such as mem=) * have been processed, in which case we already have an E820 table filled in * via the parameter callback function(s), but it's not sorted and printed yet: @@ -1063,7 +1008,6 @@ void __init e820__finish_early_params(void) static const char *__init e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return "System RAM"; case E820_TYPE_ACPI: return "ACPI Tables"; case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; @@ -1079,7 +1023,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; case E820_TYPE_ACPI: /* Fall-through: */ case E820_TYPE_NVS: /* Fall-through: */ @@ -1101,7 +1044,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case 
E820_TYPE_UNUSABLE: /* Fall-through: */ default: return IORES_DESC_NONE; @@ -1124,7 +1066,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; - case E820_TYPE_RESERVED_KERN: case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: @@ -1176,9 +1117,9 @@ void __init e820__reserve_resources(void) res++; } - /* Expose the bootloader-provided memory layout to the sysfs. */ - for (i = 0; i < e820_table_firmware->nr_entries; i++) { - struct e820_entry *entry = e820_table_firmware->entries + i; + /* Expose the kexec e820 table to the sysfs. */ + for (i = 0; i < e820_table_kexec->nr_entries; i++) { + struct e820_entry *entry = e820_table_kexec->entries + i; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } @@ -1302,6 +1243,36 @@ void __init e820__memblock_setup(void) int i; u64 end; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + + /* + * At this point only the first megabyte is mapped for sure, the + * rest of the memory cannot be used for memblock resizing + */ + memblock_set_current_limit(ISA_END_ADDRESS); + /* * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries @@ -1323,7 +1294,7 @@ void __init e820__memblock_setup(void) if (entry->type == E820_TYPE_SOFT_RESERVED) memblock_reserve(entry->addr, entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) continue; memblock_add(entry->addr, entry->size); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 44f937015e1e..fc1714bad045 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -19,6 +19,7 @@ #include <linux/usb/ehci_def.h> #include <linux/usb/xhci-dbgp.h> #include <asm/pci_x86.h> +#include <linux/static_call.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -94,26 +95,28 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ -static unsigned int io_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int io_serial_in(unsigned long addr, int offset) { return inb(addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_in); -static void io_serial_out(unsigned long addr, int offset, int value) +static __noendbr void io_serial_out(unsigned long addr, int offset, int value) { outb(value, addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_out); -static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; -static void 
(*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; +DEFINE_STATIC_CALL(serial_in, io_serial_in); +DEFINE_STATIC_CALL(serial_out, io_serial_out); static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) + while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - serial_out(early_serial_base, TXR, ch); + static_call(serial_out)(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -131,16 +134,16 @@ static __init void early_serial_hw_init(unsigned divisor) { unsigned char c; - serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ - serial_out(early_serial_base, IER, 0); /* no interrupt */ - serial_out(early_serial_base, FCR, 0); /* no fifo */ - serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */ + static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */ + static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */ + static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */ - c = serial_in(early_serial_base, LCR); - serial_out(early_serial_base, LCR, c | DLAB); - serial_out(early_serial_base, DLL, divisor & 0xff); - serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); - serial_out(early_serial_base, LCR, c & ~DLAB); + c = static_call(serial_in)(early_serial_base, LCR); + static_call(serial_out)(early_serial_base, LCR, c | DLAB); + static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); + static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); + static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); } #define DEFAULT_BAUD 9600 @@ -183,28 +186,26 @@ static __init void early_serial_init(char *s) /* Convert from baud to divisor value */ divisor = 115200 / baud; - /* These will always be IO based ports */ - serial_in = io_serial_in; - serial_out = io_serial_out; - /* Set up the HW */ early_serial_hw_init(divisor); } #ifdef CONFIG_PCI -static void mem32_serial_out(unsigned long addr, int offset, int value) +static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_out); -static unsigned int mem32_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_in); /* * early_pci_serial_init() @@ -278,15 +279,13 @@ static __init void early_pci_serial_init(char *s) */ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ - serial_in = io_serial_in; - serial_out = io_serial_out; early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ - serial_in = mem32_serial_in; - serial_out = mem32_serial_out; + static_call_update(serial_in, mem32_serial_in); + static_call_update(serial_out, mem32_serial_out); /* WARNING! 
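Editor's note: the early_printk hunks above replace the serial_in/serial_out function pointers with static calls and retarget them via static_call_update() when the MMIO backend is selected. A minimal sketch of that pattern, with hypothetical names (demo_serial_in, port_in, mmio_in) rather than the early_printk code itself:

#include <linux/static_call.h>

static unsigned int port_in(unsigned long addr, int offset)
{
        return 0;       /* placeholder backend, e.g. inb(addr + offset) */
}

static unsigned int mmio_in(unsigned long addr, int offset)
{
        return 0;       /* placeholder backend, e.g. readl() of an ioremapped BAR */
}

/* default target; each call site becomes a direct call, no pointer load */
DEFINE_STATIC_CALL(demo_serial_in, port_in);

static unsigned int demo_read(unsigned long base, int off)
{
        return static_call(demo_serial_in)(base, off);
}

static void demo_switch_to_mmio(void)
{
        /* retargets every static_call(demo_serial_in) site at once */
        static_call_update(demo_serial_in, mmio_in);
}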
assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1209c7aebb21..1b734a9ff088 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -60,9 +60,16 @@ bool irq_fpu_usable(void) if (WARN_ON_ONCE(in_nmi())) return false; - /* In kernel FPU usage already active? */ - if (this_cpu_read(in_kernel_fpu)) + /* + * In kernel FPU usage already active? This detects any explicitly + * nested usage in task or softirq context, which is unsupported. It + * also detects attempted usage in a hardirq that has interrupted a + * kernel-mode FPU section. + */ + if (this_cpu_read(in_kernel_fpu)) { + WARN_ON_FPU(!in_hardirq()); return false; + } /* * When not in NMI or hard interrupt context, FPU can be used in: @@ -220,7 +227,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) struct fpstate *fpstate; unsigned int size; - size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; @@ -232,8 +239,8 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) fpstate->is_guest = true; gfpu->fpstate = fpstate; - gfpu->xfeatures = fpu_user_cfg.default_features; - gfpu->perm = fpu_user_cfg.default_features; + gfpu->xfeatures = fpu_kernel_cfg.default_features; + gfpu->perm = fpu_kernel_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -420,7 +427,8 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { - preempt_disable(); + if (!irqs_disabled()) + fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); @@ -448,7 +456,8 @@ void kernel_fpu_end(void) WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); - preempt_enable(); + if (!irqs_disabled()) + fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index dbdb31f55fc7..975de070c9c9 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void) #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8f62e0666dea..6c69cb28b298 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -27,19 +27,14 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { - int min_xstate_size = sizeof(struct fxregs_state) + - sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; - /* Check for the first magic field and other error scenarios. 
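Editor's note: for context on the kernel_fpu_begin_mask()/kernel_fpu_end() change above, in-kernel SIMD users bracket FPU use with that pair and gate interrupt-context use with irq_fpu_usable(). A hedged, schematic example of the calling convention (not code from this patch):

#include <linux/string.h>
#include <asm/fpu/api.h>

static void demo_simd_copy(void *dst, const void *src, size_t len)
{
        if (!irq_fpu_usable()) {
                memcpy(dst, src, len);          /* fallback without FPU */
                return;
        }

        kernel_fpu_begin();
        /* vector loads/stores could go here; FPU registers are usable now */
        memcpy(dst, src, len);
        kernel_fpu_end();
}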
*/ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || - fx_sw->xstate_size > fx_sw->extended_size) + /* Check for the first magic field */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1) goto setfx; /* @@ -48,7 +43,7 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 27417b685c1d..6a41d1610d8b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -259,32 +259,20 @@ static void __init setup_xstate_cache(void) } } -static void __init print_xstate_feature(u64 xstate_mask) -{ - const char *feature_name; - - if (cpu_has_xfeatures(xstate_mask, &feature_name)) - pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); -} - /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { - print_xstate_feature(XFEATURE_MASK_FP); - print_xstate_feature(XFEATURE_MASK_SSE); - print_xstate_feature(XFEATURE_MASK_YMM); - print_xstate_feature(XFEATURE_MASK_BNDREGS); - print_xstate_feature(XFEATURE_MASK_BNDCSR); - print_xstate_feature(XFEATURE_MASK_OPMASK); - print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); - print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); - print_xstate_feature(XFEATURE_MASK_PKRU); - print_xstate_feature(XFEATURE_MASK_PASID); - print_xstate_feature(XFEATURE_MASK_CET_USER); - print_xstate_feature(XFEATURE_MASK_XTILE_CFG); - print_xstate_feature(XFEATURE_MASK_XTILE_DATA); + int i; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = BIT_ULL(i); + const char *name; + + if (cpu_has_xfeatures(mask, &name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); + } } /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index aa16f1a1bbcf..0fd34f53f025 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -94,30 +94,33 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " +#define REX_SUFFIX "64" #else -#define REX_PREFIX +#define REX_SUFFIX #endif -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" +#define XSAVE "xsave" REX_SUFFIX " %[xa]" +#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]" +#define XSAVEC "xsavec" REX_SUFFIX " %[xa]" +#define XSAVES "xsaves" REX_SUFFIX " %[xa]" +#define XRSTOR "xrstor" REX_SUFFIX " %[xa]" +#define XRSTORS "xrstors" REX_SUFFIX " %[xa]" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. + * + * The [xa] input parameter below represents the struct xregs_state pointer + * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns + * above. 
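Editor's note: the XSTATE_OP rework above drops the hard-coded .byte encodings and the implicit "D" (st) register operand in favour of real mnemonics with a named memory operand [xa]. A small stand-alone illustration of the named-operand asm syntax (plain user-space C, x86-64 gcc/clang assumed; this is not the kernel macro):

#include <stdio.h>

static unsigned long load64(const unsigned long *p)
{
        unsigned long v;

        /* [src] and [dst] name the operands instead of positional %0/%1 */
        asm volatile("movq %[src], %[dst]"
                     : [dst] "=r" (v)
                     : [src] "m" (*p));
        return v;
}

int main(void)
{
        unsigned long x = 0x1234abcd;

        printf("loaded %#lx\n", load64(&x));
        return 0;
}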
*/ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ - "2:\n\t" \ + "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* @@ -137,12 +140,12 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ + "\n\t" \ "xor %[err], %[err]\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* @@ -156,7 +159,7 @@ static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 ma "3:\n" \ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 166bc0ea3bdf..cace6e8d7cc7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) { + if (ftrace_poke_late) text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - } else { - mutex_lock(&text_mutex); - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); - mutex_unlock(&text_mutex); - } + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } @@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - void *ret; + int ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = text_poke_copy(trampoline, (void *)start_offset, size); - if (WARN_ON(!ret)) + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - text_poke_copy(ip, retq, sizeof(retq)); + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - if (!text_poke_copy(ip, x86_nops[2], 2)) + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) goto fail; } @@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - text_poke_copy(ptr, &ops, sizeof(unsigned long)); + *ptr = (unsigned long)ops; op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to 
the ftrace_ops */ - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - text_poke_copy_locked(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE, false); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index d51647228596..367da3638167 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -146,12 +146,14 @@ SYM_FUNC_END(ftrace_stub_graph) #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) SYM_FUNC_START(ftrace_caller) + ANNOTATE_NOENDBR /* save_mcount_regs fills in first two parameters */ save_mcount_regs @@ -197,6 +199,7 @@ SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_regs_caller) + ANNOTATE_NOENDBR /* Save the current flags before any operations that can change them */ pushfq @@ -310,6 +313,7 @@ SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(ftrace_stub_direct_tramp) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub_direct_tramp) @@ -317,6 +321,7 @@ SYM_FUNC_END(ftrace_stub_direct_tramp) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT cmpq $ftrace_stub, ftrace_trace_function diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 22c9ba305ac1..fa9b6339975f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -5,8 +5,6 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ -#define DISABLE_BRANCH_PROFILING - /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 @@ -567,7 +565,7 @@ void early_setup_idt(void) */ void __head startup_64_setup_gdt_idt(void) { - struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt); + struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; void *handler = NULL; struct desc_ptr startup_gdt_descr = { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 31345e0ba006..fefe2a25cf02 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -61,11 +61,14 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu() */ leaq __top_init_kernel_stack(%rip), %rsp - /* Setup GSBASE to allow stack canary access for C code */ + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. 
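Editor's note: the startup_64 change above clears EAX and EDX before wrmsr, i.e. it writes GSBASE = 0 instead of the address of fixed_percpu_data; the later hunk keeps the usual EDX:EAX split of a 64-bit value. A trivial user-space illustration of that split, with a made-up base address:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t base = 0xffff888012345678ULL;   /* hypothetical GSBASE value */
        uint32_t eax = (uint32_t)base;           /* low  32 bits -> EAX */
        uint32_t edx = (uint32_t)(base >> 32);   /* high 32 bits -> EDX */

        printf("wrmsr would see edx=%#x eax=%#x\n", edx, eax);
        return 0;
}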
+ */ movl $MSR_GS_BASE, %ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr call startup_64_setup_gdt_idt @@ -319,7 +322,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) * * RDX contains the per-cpu offset */ - movq pcpu_hot + X86_current_task(%rdx), %rax + movq current_task(%rdx), %rax movq TASK_threadsp(%rax), %rsp /* @@ -359,17 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) movl %eax,%fs movl %eax,%gs - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx -#ifndef CONFIG_SMP - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx -#endif movl %edx, %eax shrq $32, %rdx wrmsr @@ -435,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx + movq PER_CPU_VAR(current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index c20d1832c481..2bade73f49e3 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -23,6 +23,7 @@ #include <asm/desc.h> #include <asm/apic.h> #include <asm/i8259.h> +#include <asm/io_apic.h> /* * This is the 'legacy' 8259A Programmable Interrupt Controller, diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index e2fab3ceb09f..6290dd120f5e 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -144,7 +144,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * Update the sequence number to force a TSS update on return to * user mode. */ - iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + iobm->sequence = atomic64_inc_return(&io_bitmap_sequence); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index feca4f20b06a..81f9b78e0f7b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -33,6 +33,11 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +EXPORT_PER_CPU_SYMBOL(__softirq_pending); + +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); + atomic_t irq_err_count; /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index dc1049c01f9b..c7a5d2960d57 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -29,12 +29,9 @@ int sysctl_panic_on_stackoverflow __read_mostly; /* Debugging check for stack overflow: is there less than 1KB free? 
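Editor's note: the reworked check_stack_overflow() above exploits the fact that kernel stacks are THREAD_SIZE-aligned and THREAD_SIZE is a power of two, so sp & (THREAD_SIZE - 1) is the stack pointer's offset within its stack, i.e. how much room remains above the thread_info at the bottom. A runnable sketch of the arithmetic, with assumed THREAD_SIZE/STACK_WARN values:

#include <stdio.h>

#define THREAD_SIZE   (8 * 1024UL)      /* assumed stack size, power of two */
#define STACK_WARN    (THREAD_SIZE / 8) /* assumed warning threshold */

int main(void)
{
        /* pretend the stack occupies [0x10002000, 0x10004000) and sp sits near the bottom */
        unsigned long sp = 0x10002040UL;
        unsigned long left = sp & (THREAD_SIZE - 1);

        printf("bytes left below sp: %lu, warning threshold: %lu\n", left, STACK_WARN);
        return 0;
}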
*/ -static int check_stack_overflow(void) +static bool check_stack_overflow(void) { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); + unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1); return sp < (sizeof(struct thread_info) + STACK_WARN); } @@ -48,18 +45,19 @@ static void print_stack_overflow(void) } #else -static inline int check_stack_overflow(void) { return 0; } +static inline bool check_stack_overflow(void) { return false; } static inline void print_stack_overflow(void) { } #endif +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); + static void call_on_stack(void *func, void *stack) { - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=b" (stack) - : "0" (stack), - [thunk_target] "D"(func) + "movl %[sp], %%esp" + : [sp] "+b" (stack) + : [thunk_target] "D" (func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -68,13 +66,13 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } -static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1; + u32 *isp, *prev_esp; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -83,7 +81,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) * current stack (which is the irq stack already after all) */ if (unlikely(curstk == irqstk)) - return 0; + return false; isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -94,14 +92,13 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=a" (arg1), "=b" (isp) - : "0" (desc), "1" (isp), - [thunk_target] "D" (desc->handle_irq) - : "memory", "cc", "ecx"); - return 1; + "movl %[sp], %%esp" + : "+a" (desc), [sp] "+b" (isp) + : [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "edx", "ecx"); + return true; } /* @@ -112,7 +109,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -124,8 +121,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -135,7 +132,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -150,7 +147,7 @@ void do_softirq_own_stack(void) void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - int overflow = check_stack_overflow(); + bool overflow = check_stack_overflow(); if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if 
(unlikely(overflow)) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ade0043ce56e..ca78dce39361 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,8 +26,8 @@ #include <asm/io_apic.h> #include <asm/apic.h> +DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; -DECLARE_INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_VMAP_STACK /* @@ -51,7 +51,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -64,14 +64,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 7f542a7799cb..fdabd5dda154 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -9,6 +9,7 @@ */ .pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) + ENDBR pushf pop %_ASM_AX RET diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 72e6a45e7ec2..09608fd93687 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,16 +373,7 @@ out: kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - u32 insn; - - /* - * Since 'addr' is not guaranteed to be safe to access, use - * copy_from_kernel_nofault() to read the instruction: - */ - if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) - return NULL; - - if (is_endbr(insn)) { + if (is_endbr((u32 *)addr)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7a422a6c5983..3be9b3342c67 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c00..a7998f351701 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -19,6 +19,7 @@ #include <linux/jump_label.h> #include <linux/random.h> #include <linux/memory.h> +#include <linux/stackprotector.h> #include <asm/text-patching.h> #include <asm/page.h> @@ -130,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, goto overflow; size = 4; break; +#if defined(CONFIG_STACKPROTECTOR) && \ + defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 + case R_X86_64_REX_GOTPCRELX: { + static unsigned long __percpu *const addr = &__stack_chk_guard; + + if (sym->st_value != (u64)addr) { + pr_err("%s: Unsupported GOTPCREL relocation\n", me->name); + return -ENOEXEC; + } + + val = (u64)&addr + rel[i].r_addend; + fallthrough; + } +#endif case R_X86_64_PC32: case R_X86_64_PLT32: val -= (u64)loc; @@ -146,21 +161,18 
@@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - void *wr_loc = module_writable_address(me, loc); - - if (memcmp(wr_loc, &zero, size)) { + if (memcmp(loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(wr_loc, &val, size); + write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -227,7 +239,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -236,6 +248,8 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -266,60 +280,33 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size, me); + apply_retpolines(rseg, rseg + retpolines->sh_size); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size, me); - } - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size, me); + apply_returns(rseg, rseg + returns->sh_size); } - if (calls || alt) { + if (calls) { struct callthunk_sites cs = {}; - if (calls) { - cs.call_start = (void *)calls->sh_addr; - cs.call_end = (void *)calls->sh_addr + calls->sh_size; - } - - if (alt) { - cs.alt_start = (void *)alt->sh_addr; - cs.alt_end = (void *)alt->sh_addr + alt->sh_size; - } + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; callthunks_patch_module_calls(&cs, me); } + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } - - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - - return 0; -} - -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - } - if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -329,6 +316,10 @@ int module_post_finalize(const Elf_Ehdr *hdr, text, text_end); } + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, 
orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ed163c8c8604..9a95d00f1423 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -40,8 +40,12 @@ #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> +/* + * An emergency handler can be set in any context including NMI + */ struct nmi_desc { raw_spinlock_t lock; + nmi_handler_t emerg_handler; struct list_head head; }; @@ -132,9 +136,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); + nmi_handler_t ehandler; struct nmiaction *a; int handled=0; + /* + * Call the emergency handler, if set + * + * In the case of crash_nmi_callback() emergency handler, it will + * return in the case of the crashing CPU to enable it to complete + * other necessary crashing actions ASAP. Other handlers in the + * linked list won't need to be run. + */ + ehandler = desc->emerg_handler; + if (ehandler) + return ehandler(type, regs); + rcu_read_lock(); /* @@ -224,6 +241,31 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); +/** + * set_emergency_nmi_handler - Set emergency handler + * @type: NMI type + * @handler: the emergency handler to be stored + * + * Set an emergency NMI handler which, if set, will preempt all the other + * handlers in the linked list. If a NULL handler is passed in, it will clear + * it. It is expected that concurrent calls to this function will not happen + * or the system is screwed beyond repair. + */ +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler) +{ + struct nmi_desc *desc = nmi_to_desc(type); + + if (WARN_ON_ONCE(desc->emerg_handler == handler)) + return; + desc->emerg_handler = handler; + + /* + * Ensure the emergency handler is visible to other CPUs before + * function return + */ + smp_wmb(); +} + static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccaa3397a67..97925632c28e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,21 +59,6 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } -#ifndef CONFIG_PT_RECLAIM -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -90,30 +75,20 @@ void paravirt_set_sched_clock(u64 (*func)(void)) static_call_update(pv_sched_clock, func); } -/* These are in entry.S */ -static struct resource reserve_ioports = { - .start = 0, - .end = IO_SPACE_LIMIT, - .name = "paravirt-ioport", - .flags = IORESOURCE_IO | IORESOURCE_BUSY, -}; +#ifdef CONFIG_PARAVIRT_XXL +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} -/* - * Reserve the whole legacy IO space to prevent any legacy drivers - * from wasting time probing for their hardware. This is a fairly - * brute-force approach to disabling all non-virtual drivers. - * - * Note that this must be called very early to have any effect. 
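Editor's note: set_emergency_nmi_handler(), added in the nmi.c hunk above, installs a single handler that short-circuits the normal nmi_handle() list; the reboot path later in this patch uses it instead of register_nmi_handler(..., NMI_FLAG_FIRST, ...). A hedged sketch of the intended call pattern (the callback name is illustrative only):

#include <asm/nmi.h>

static int demo_crash_nmi(unsigned int type, struct pt_regs *regs)
{
        /* park this CPU; in the real crash path only the crashing CPU returns */
        return NMI_HANDLED;
}

static void demo_shootdown(void)
{
        /* from here on, every NMI_LOCAL NMI goes straight to demo_crash_nmi() */
        set_emergency_nmi_handler(NMI_LOCAL, demo_crash_nmi);

        /* ... send the NMI IPI to all other CPUs ... */
}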
- */ -int paravirt_disable_iospace(void) +static noinstr unsigned long pv_native_read_cr3(void) { - return request_resource(&ioport_resource, &reserve_ioports); + return __native_read_cr3(); } -#ifdef CONFIG_PARAVIRT_XXL -static noinstr void pv_native_write_cr2(unsigned long val) +static noinstr void pv_native_write_cr3(unsigned long cr3) { - native_write_cr2(val); + native_write_cr3(cr3); } static noinstr unsigned long pv_native_get_debugreg(int regno) @@ -195,7 +170,6 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, @@ -203,8 +177,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, - .mmu.read_cr3 = __native_read_cr3, - .mmu.write_cr3 = native_write_cr3, + .mmu.read_cr3 = pv_native_read_cr3, + .mmu.write_cr3 = pv_native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6da6769d7254..91f6ff618852 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -93,7 +93,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* init_task is not dynamically sized (incomplete FPU state) */ + if (unlikely(src == &init_task)) + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0); + else + memcpy(dst, src, arch_task_struct_size); + #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif @@ -1043,7 +1048,7 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(int option, unsigned long arg2) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { switch (option) { case ARCH_GET_CPUID: @@ -1058,5 +1063,13 @@ long do_arch_prctl_common(int option, unsigned long arg2) return fpu_xstate_prctl(option, arg2); } + if (!in_ia32_syscall()) + return do_arch_prctl_64(current, option, arg2); + return -EINVAL; } + +SYSCALL_DEFINE0(ni_syscall) +{ + return -ENOSYS; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0917c7f25720..4636ef359973 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,13 +190,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and pcpu_hot.top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. 
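Editor's note: arch_dup_task_struct() above copies init_task with memcpy_and_pad() because init_task is statically sized and smaller than the dynamically sized task_struct (its FPU state is incomplete), so the tail must be zero-filled rather than read past the source. A user-space sketch of the same copy-and-pad semantics (the helper name here is made up):

#include <stdio.h>
#include <string.h>

static void copy_and_pad(void *dst, size_t dst_len,
                         const void *src, size_t src_len, int pad)
{
        size_t n = src_len < dst_len ? src_len : dst_len;

        memcpy(dst, src, n);
        memset((char *)dst + n, pad, dst_len - n);
}

int main(void)
{
        char small[4] = { 1, 2, 3, 4 };
        char big[16];

        copy_and_pad(big, sizeof(big), small, sizeof(small), 0);
        printf("big[4]=%d big[15]=%d\n", big[4], big[15]);   /* both print 0 */
        return 0;
}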
*/ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(pcpu_hot.top_of_stack, + this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -206,7 +206,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - raw_cpu_write(pcpu_hot.current_task, next_p); + raw_cpu_write(current_task, next_p); switch_fpu_finish(next_p); @@ -215,8 +215,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 226472332a70..7196ca7048be 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -614,7 +614,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(pcpu_hot.hardirq_stack_inuse)); + this_cpu_read(hardirq_stack_inuse)); if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_p, cpu); @@ -668,8 +668,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - raw_cpu_write(pcpu_hot.current_task, next_p); - raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); + raw_cpu_write(current_task, next_p); + raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(next_p); @@ -942,7 +942,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, arg2); # endif -# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +# ifdef CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, arg2); # endif @@ -979,26 +979,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) return ret; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - long ret; - - ret = do_arch_prctl_64(current, option, arg2); - if (ret == -EINVAL) - ret = do_arch_prctl_common(option, arg2); - - return ret; -} - -#ifdef CONFIG_IA32_EMULATION -COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} -#endif - -unsigned long KSTK_ESP(struct task_struct *task) -{ - return task_pt_regs(task)->sp; -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6d0df6a58873..a92f18db9610 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -10,6 +10,8 @@ #include <asm/setup.h> #include <asm/mce.h> +#include <linux/platform_data/x86/apple.h> + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void quirk_intel_irqbalance(struct pci_dev *dev) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index dc1dd3f3e67f..964f6b0a3d68 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -921,20 +921,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) return; /* Make a note of crashing cpu. Will be used in NMI callback. */ - crashing_cpu = safe_smp_processor_id(); + crashing_cpu = smp_processor_id(); shootdown_callback = callback; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? 
*/ - if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, - NMI_FLAG_FIRST, "crash")) - return; /* Return what? */ + /* - * Ensure the new callback function is set before sending - * out the NMI + * Set emergency handler to preempt other handlers. */ - wmb(); + set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback); apic_send_IPI_allbutself(NMI_VECTOR); diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index b44d8863e57f..ac058971a382 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -40,6 +40,16 @@ SYM_DATA(kexec_pa_table_page, .quad 0) SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) + .balign 16 +SYM_DATA_START_LOCAL(kexec_debug_gdt) + .word kexec_debug_gdt_end - kexec_debug_gdt - 1 + .long 0 + .word 0 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + .section .text..relocate_kernel,"ax"; .code64 SYM_CODE_START_NOALIGN(relocate_kernel) @@ -116,6 +126,19 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* store the start address on the stack */ pushq %rdx + /* Create a GDTR (16 bits limit, 64 bits addr) on stack */ + leaq kexec_debug_gdt(%rip), %rax + pushq %rax + pushw (%rax) + + /* Load the GDT, put the stack back */ + lgdt (%rsp) + addq $10, %rsp + + /* Test that we can load segments */ + movq %ds, %rax + movq %rax, %ds + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below. diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cebee310e200..c7164a8de983 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -56,6 +56,9 @@ #include <asm/unwind.h> #include <asm/vsyscall.h> #include <linux/vmalloc.h> +#if defined(CONFIG_X86_LOCAL_APIC) +#include <asm/nmi.h> +#endif /* * max_low_pfn_mapped: highest directly mapped pfn < 4 GB @@ -146,6 +149,69 @@ static size_t ima_kexec_buffer_size; /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; +static const struct ctl_table x86_sysctl_table[] = { + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_X86_LOCAL_APIC) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +}; + +static int __init init_x86_sysctl(void) +{ + register_sysctl_init("kernel", x86_sysctl_table); + return 0; +} +arch_initcall(init_x86_sysctl); + /* * Setup options */ @@ -429,6 
+495,46 @@ static void __init parse_setup_data(void) } } +/* + * Translate the fields of 'struct boot_param' into global variables + * representing these parameters. + */ +static void __init parse_boot_params(void) +{ + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } +#endif + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; +} + static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_indirect *indirect; @@ -527,6 +633,23 @@ void __init reserve_standard_io_resources(void) } +static void __init setup_kernel_resources(void) +{ + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); +} + static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -789,35 +912,7 @@ void __init setup_arch(char **cmdline_p) setup_olpc_ofw_pgd(); - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; -#ifdef CONFIG_X86_32 - apm_info.bios = boot_params.apm_bios_info; - ist_info = boot_params.ist_info; -#endif - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - if ((bootloader_type >> 4) == 0xe) { - bootloader_type &= 0xf; - bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; - } - bootloader_version = bootloader_type & 0xf; - bootloader_version |= boot_params.hdr.ext_loader_ver << 4; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI32_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI64_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - } -#endif + parse_boot_params(); x86_init.oem.arch_setup(); @@ -841,19 +936,8 @@ void __init setup_arch(char **cmdline_p) copy_edd(); - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; setup_initial_init_mm(_text, 
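Editor's note: parse_boot_params() above keeps the existing bootloader ID decoding: an 0xE high nibble in type_of_loader means the real type lives in ext_loader_type, and ext_loader_ver supplies the upper version bits. A runnable trace of that decoding with made-up header values:

#include <stdio.h>

int main(void)
{
        unsigned int type_of_loader  = 0xe4;   /* hypothetical: extended loader, low nibble 4 */
        unsigned int ext_loader_type = 0x12;
        unsigned int ext_loader_ver  = 0x3;
        int bootloader_type, bootloader_version;

        bootloader_type = type_of_loader;
        if ((bootloader_type >> 4) == 0xe) {
                bootloader_type &= 0xf;
                bootloader_type |= (ext_loader_type + 0x10) << 4;
        }
        bootloader_version  = bootloader_type & 0xf;
        bootloader_version |= ext_loader_ver << 4;

        printf("type=%#x version=%#x\n", bootloader_type, bootloader_version);
        return 0;
}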
_etext, _edata, (void *)_brk_end); - code_resource.start = __pa_symbol(_text); - code_resource.end = __pa_symbol(_etext)-1; - rodata_resource.start = __pa_symbol(__start_rodata); - rodata_resource.end = __pa_symbol(__end_rodata)-1; - data_resource.start = __pa_symbol(_sdata); - data_resource.end = __pa_symbol(_edata)-1; - bss_resource.start = __pa_symbol(__bss_start); - bss_resource.end = __pa_symbol(__bss_stop)-1; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -866,30 +950,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_memblock_x86_reserve_range(); -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - if (movable_node_is_enabled()) - memblock_set_bottom_up(true); -#endif - x86_report_nx(); apic_setup_apic_calls(); @@ -901,7 +961,6 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } - e820__reserve_setup_data(); e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) @@ -921,11 +980,11 @@ void __init setup_arch(char **cmdline_p) tsc_early_init(); x86_init.resources.probe_roms(); - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &rodata_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); + /* + * Add resources for kernel text and data to the iomem_resource. + * Do it after parse_early_param, so it can be debugged. + */ + setup_kernel_resources(); e820_add_kernel_range(); trim_bios_range(); @@ -990,7 +1049,6 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); /* diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b30d6e180df7..bfa48e7a32a2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,18 +23,13 @@ #include <asm/cpumask.h> #include <asm/cpu.h> -#ifdef CONFIG_X86_64 -#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) -#else -#define BOOT_PERCPU_OFFSET 0 -#endif +DEFINE_PER_CPU_CACHE_HOT(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); -DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { - [0 ... 
NR_CPUS-1] = BOOT_PERCPU_OFFSET, -}; +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init; EXPORT_SYMBOL(__per_cpu_offset); /* @@ -169,7 +164,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index ef654530bf5a..98123ff10506 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -33,25 +33,55 @@ #include <asm/smap.h> #include <asm/gsseg.h> +/* + * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0 + * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index + * GDT, selector values 0~3 all point to the NULL descriptor, thus values + * 0, 1, 2 and 3 are all valid NULL selector values. + * + * However IRET zeros ES, FS, GS, and DS segment registers if any of them + * is found to have any nonzero NULL selector value, which can be used by + * userspace in pre-FRED systems to spot any interrupt/exception by loading + * a nonzero NULL selector and waiting for it to become zero. Before FRED + * there was nothing software could do to prevent such an information leak. + * + * ERETU, the only legit instruction to return to userspace from kernel + * under FRED, by design does NOT zero any segment register to avoid this + * problem behavior. + * + * As such, leave NULL selector values 0~3 unchanged. + */ +static inline u16 fixup_rpl(u16 sel) +{ + return sel <= 3 ? sel : sel | 3; +} + #ifdef CONFIG_IA32_EMULATION #include <asm/unistd_32_ia32.h> static inline void reload_segments(struct sigcontext_32 *sc) { - unsigned int cur; + u16 cur; + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ savesegment(gs, cur); - if ((sc->gs | 0x03) != cur) - load_gs_index(sc->gs | 0x03); + if (fixup_rpl(sc->gs) != cur) + load_gs_index(fixup_rpl(sc->gs)); savesegment(fs, cur); - if ((sc->fs | 0x03) != cur) - loadsegment(fs, sc->fs | 0x03); + if (fixup_rpl(sc->fs) != cur) + loadsegment(fs, fixup_rpl(sc->fs)); + savesegment(ds, cur); - if ((sc->ds | 0x03) != cur) - loadsegment(ds, sc->ds | 0x03); + if (fixup_rpl(sc->ds) != cur) + loadsegment(ds, fixup_rpl(sc->ds)); savesegment(es, cur); - if ((sc->es | 0x03) != cur) - loadsegment(es, sc->es | 0x03); + if (fixup_rpl(sc->es) != cur) + loadsegment(es, fixup_rpl(sc->es)); } #define sigset32_t compat_sigset_t @@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, regs->orig_ax = -1; #ifdef CONFIG_IA32_EMULATION - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. 
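Editor's note: fixup_rpl() above preserves the four NULL selector values 0-3 and forces RPL=3 on everything else, so a signal handler can no longer smuggle a nonzero NULL selector through sigreturn. A runnable check of that rule:

#include <stdio.h>

static unsigned short fixup_rpl(unsigned short sel)
{
        return sel <= 3 ? sel : sel | 3;
}

int main(void)
{
        unsigned short samples[] = { 0, 1, 2, 3, 0x23, 0x28 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%#x -> %#x\n", samples[i], fixup_rpl(samples[i]));
        return 0;
}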
- */ reload_segments(&sc); #else - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; + loadsegment(gs, fixup_rpl(sc.gs)); + regs->fs = fixup_rpl(sc.fs); + regs->es = fixup_rpl(sc.es); + regs->ds = fixup_rpl(sc.ds); #endif return fpu__restore_sig(compat_ptr(sc.fpstate), 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c10850ae6f09..d6cf1e23c2a3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -190,7 +190,7 @@ static void ap_starting(void) apic_ap_setup(); /* Save the processor parameters. */ - smp_store_cpu_info(cpuid); + identify_secondary_cpu(cpuid); /* * The topology information must be up to date before @@ -215,7 +215,7 @@ static void ap_calibrate_delay(void) { /* * Calibrate the delay loop and update loops_per_jiffy in cpu_data. - * smp_store_cpu_info() stored a value that is close but not as + * identify_secondary_cpu() stored a value that is close but not as * accurate as the value just calculated. * * As this is invoked after the TSC synchronization check, @@ -229,7 +229,7 @@ static void ap_calibrate_delay(void) /* * Activate a secondary processor. */ -static void notrace start_secondary(void *unused) +static void notrace __noendbr start_secondary(void *unused) { /* * Don't put *anything* except direct CPU state initialization @@ -314,26 +314,7 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ -void smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = &cpu_data(id); - - /* Copy boot_cpu_data only on the first bringup */ - if (!c->initialized) - *c = boot_cpu_data; - c->cpu_index = id; - /* - * During boot time, CPU0 has this setup already. Save the info when - * bringing up an AP. - */ - identify_secondary_cpu(c); - c->initialized = true; -} +ANNOTATE_NOENDBR_SYM(start_secondary); static bool topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -654,10 +635,9 @@ static void impress_friends(void) * But that slows boot and resume on modern processors, which include * many cores and don't require that delay. * - * Cmdline "init_cpu_udelay=" is available to over-ride this delay. - * Modern processor families are quirked to remove the delay entirely. + * Cmdline "cpu_init_udelay=" is available to override this delay. 
*/ -#define UDELAY_10MS_DEFAULT 10000 +#define UDELAY_10MS_LEGACY 10000 static unsigned int init_udelay = UINT_MAX; @@ -669,21 +649,21 @@ static int __init cpu_init_udelay(char *str) } early_param("cpu_init_udelay", cpu_init_udelay); -static void __init smp_quirk_init_udelay(void) +static void __init smp_set_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ - if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) || + (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) || + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) { init_udelay = 0; return; } /* else, use legacy delay */ - init_udelay = UDELAY_10MS_DEFAULT; + init_udelay = UDELAY_10MS_LEGACY; } /* @@ -841,7 +821,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(pcpu_hot.current_task, cpu) = idle; + per_cpu(current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ @@ -851,7 +831,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #endif return 0; } @@ -1094,7 +1074,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); - smp_quirk_init_udelay(); + smp_set_init_udelay(); speculative_store_bypass_ht_init(); @@ -1262,43 +1242,9 @@ void play_dead_common(void) * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ -static inline void mwait_play_dead(void) +void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. 
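Editor's note: the code being dropped from mwait_play_dead() around here derived the deepest MWAIT hint from CPUID leaf 5, where EDX holds 4-bit sub-C-state counts per C-state and the hint packs the deepest C-state with its highest sub-state minus one; callers now pass the hint in as eax_hint. A runnable sketch of that scan with a made-up EDX value (shift/mask constants as in asm/mwait.h):

#include <stdio.h>

#define MWAIT_SUBSTATE_SIZE 4
#define MWAIT_SUBSTATE_MASK 0xf

int main(void)
{
        unsigned int edx = 0x00112220;  /* hypothetical sub-C-state counts from CPUID.5:EDX */
        unsigned int highest_cstate = 0, highest_subcstate = 0, eax;

        edx >>= MWAIT_SUBSTATE_SIZE;    /* skip the C0 field */
        for (int i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | (highest_subcstate - 1);

        printf("mwait hint eax=%#x\n", eax);
        return 0;
}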
- */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; @@ -1319,7 +1265,7 @@ static inline void mwait_play_dead(void) mb(); __monitor(md, 0, 0); mb(); - __mwait(eax, 0); + __mwait(eax_hint, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* @@ -1391,9 +1337,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c1bcb6053fc..46b8f1f16676 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -200,8 +200,7 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; for (i = 0; i < e820_table->nr_entries; i++) { - if ((e820_table->entries[i].type != E820_TYPE_RAM) - && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) + if (e820_table->entries[i].type != E820_TYPE_RAM) continue; add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2dbadf347b5f..9f88b8a78e50 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -94,10 +94,20 @@ __always_inline int is_valid_bugaddr(unsigned long addr) /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. - * If it's a UD1, get the ModRM byte to pass along to UBSan. + * If it's a UD1, further decode to determine its use: + * + * FineIBT: ea (bad) + * FineIBT: f0 75 f9 lock jne . - 6 + * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax + * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax + * static_call: 0f b9 cc ud1 %esp,%ecx + * + * Notably UBSAN uses EAX, static_call uses ECX. */ -__always_inline int decode_bug(unsigned long addr, u32 *imm) +__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { + unsigned long start = addr; + bool lock = false; u8 v; if (addr < TASK_SIZE_MAX) @@ -106,28 +116,67 @@ __always_inline int decode_bug(unsigned long addr, u32 *imm) v = *(u8 *)(addr++); if (v == INSN_ASOP) v = *(u8 *)(addr++); - if (v != OPCODE_ESCAPE) + + if (v == INSN_LOCK) { + lock = true; + v = *(u8 *)(addr++); + } + + switch (v) { + case 0x70 ... 
0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xea: + *len = addr - start; + return BUG_EA; + + case OPCODE_ESCAPE: + break; + + default: return BUG_NONE; + } v = *(u8 *)(addr++); - if (v == SECOND_BYTE_OPCODE_UD2) + if (v == SECOND_BYTE_OPCODE_UD2) { + *len = addr - start; return BUG_UD2; + } - if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + if (v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; - /* Retrieve the immediate (type value) for the UBSAN UD1 */ - v = *(u8 *)(addr++); - if (X86_MODRM_RM(v) == 4) - addr++; - *imm = 0; - if (X86_MODRM_MOD(v) == 1) - *imm = *(u8 *)addr; - else if (X86_MODRM_MOD(v) == 2) - *imm = *(u32 *)addr; - else - WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + v = *(u8 *)(addr++); /* ModRM */ + + if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) + addr++; /* SIB */ + + /* Decode immediate, if present */ + switch (X86_MODRM_MOD(v)) { + case 0: if (X86_MODRM_RM(v) == 5) + addr += 4; /* RIP + disp32 */ + break; + + case 1: *imm = *(s8 *)addr; + addr += 1; + break; + + case 2: *imm = *(s32 *)addr; + addr += 4; + break; + + case 3: break; + } + + /* record instruction length */ + *len = addr - start; + + if (X86_MODRM_REG(v) == 0) /* EAX */ + return BUG_UD1_UBSAN; return BUG_UD1; } @@ -257,11 +306,12 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; - int ud_type; - u32 imm; + int ud_type, ud_len; + s32 ud_imm; - ud_type = decode_bug(regs->ip, &imm); + ud_type = decode_bug(addr, &ud_imm, &ud_len); if (ud_type == BUG_NONE) return handled; @@ -281,15 +331,47 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (ud_type == BUG_UD2) { - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; + + switch (ud_type) { + case BUG_UD2: + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + fallthrough; + + case BUG_EA: + case BUG_LOCK: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { handled = true; + break; + } + break; + + case BUG_UD1_UBSAN: + if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", + report_ubsan_failure(regs, ud_imm), + (void *)regs->ip); } - } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { - pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); + break; + + default: + break; + } + + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; } + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); @@ -380,6 +462,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, #endif /* + * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 + * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch + * between configs triggers objtool warnings. + * + * This is a temporary hack until we have compiler or plugin support for + * annotating noreturns. 
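The opcode table above is the key to the rework: a UD1 whose ModRM reg field selects EAX is a UBSan trap carrying its type code in the displacement, while static_call traps use ECX. A stand-alone sketch of that ModRM classification (classify_ud1() is an invented name; the 0x67 address-size prefix and the lock-prefixed FineIBT form are deliberately not handled here):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

#define X86_MODRM_MOD(m) (((m) >> 6) & 3)
#define X86_MODRM_REG(m) (((m) >> 3) & 7)
#define X86_MODRM_RM(m)  ((m) & 7)

/* Classify a bare UD1 (0f b9 /r): decode the ModRM byte, skip any SIB byte,
 * pull out the displacement, and report whether the reg field is EAX (UBSan)
 * or something else (e.g. ECX for static_call).
 * Returns the number of bytes consumed, or -1 if this is not a UD1. */
static int classify_ud1(const uint8_t *insn, size_t len, int32_t *imm, const char **user)
{
	size_t i = 0;
	uint8_t modrm;

	if (len < 3 || insn[0] != 0x0f || insn[1] != 0xb9)
		return -1;
	i = 2;
	modrm = insn[i++];

	if (X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4)
		i++;						/* SIB byte */

	*imm = 0;
	switch (X86_MODRM_MOD(modrm)) {
	case 0:
		if (X86_MODRM_RM(modrm) == 5)
			i += 4;					/* RIP/EIP + disp32 */
		break;
	case 1:
		*imm = (int8_t)insn[i];				/* disp8 */
		i += 1;
		break;
	case 2:
		*imm = (int32_t)(insn[i] | insn[i + 1] << 8 |
				 insn[i + 2] << 16 | (uint32_t)insn[i + 3] << 24);
		i += 4;						/* disp32 */
		break;
	case 3:
		break;						/* register form, no displacement */
	}

	*user = X86_MODRM_REG(modrm) == 0 ? "UBSan" : "static_call/other";
	return (int)i;
}

int main(void)
{
	const uint8_t ubsan[] = { 0x0f, 0xb9, 0x40, 0x10 };	/* ud1 0x10(%eax),%eax */
	const uint8_t scall[] = { 0x0f, 0xb9, 0xcc };		/* ud1 %esp,%ecx */
	int32_t imm;
	const char *user;

	if (classify_ud1(ubsan, sizeof(ubsan), &imm, &user) > 0)
		printf("%s, imm=%d\n", user, imm);
	if (classify_ud1(scall, sizeof(scall), &imm, &user) > 0)
		printf("%s, imm=%d\n", user, imm);
	return 0;
}

Feeding it the two encodings quoted in the comment yields a UBSan hit with immediate 16 for "0f b9 40 10" and a non-UBSan hit for "0f b9 cc".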
+ */ +#ifdef CONFIG_X86_ESPFIX64 +#define always_true() true +#else +bool always_true(void); +bool __weak always_true(void) { return true; } +#endif + +/* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the @@ -514,7 +611,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault) pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); - panic("Machine halted."); + if (always_true()) + panic("Machine halted."); instrumentation_end(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 34dec0b72ea8..88e5a4ed9db3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -959,7 +959,7 @@ static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -979,7 +979,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index deeb02825670..48e6cc1cb017 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 5a952c5ea66b..9194695662b2 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,19 +357,23 @@ void *arch_uprobe_trampoline(unsigned long *psize) return &insn; } -static unsigned long trampoline_check_ip(void) +static unsigned long trampoline_check_ip(unsigned long tramp) { - unsigned long tramp = uprobe_get_trampoline_vaddr(); - return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); } SYSCALL_DEFINE0(uretprobe) { struct pt_regs *regs = task_pt_regs(current); - unsigned long err, ip, sp, r11_cx_ax[3]; + unsigned long err, ip, sp, r11_cx_ax[3], tramp; + + /* If there's no trampoline, we are called from wrong place. */ + tramp = uprobe_get_trampoline_vaddr(); + if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR)) + goto sigill; - if (regs->ip != trampoline_check_ip()) + /* Make sure the ip matches the only allowed sys_uretprobe caller. 
*/ + if (unlikely(regs->ip != trampoline_check_ip(tramp))) goto sigill; err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 1258a5872d12..37ad43792452 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -29,8 +29,12 @@ */ #include <asm/cpufeatures.h> +#include <asm/cpufeaturemasks.h> #include <asm/msr-index.h> +#define SSE_MASK \ + (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31)))) + SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0deb4887d6e9..ccdc45e5b759 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -43,7 +43,8 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; -const_pcpu_hot = pcpu_hot; +const_current_task = current_task; +const_cpu_current_top_of_stack = cpu_current_top_of_stack; #if defined(CONFIG_X86_64) /* @@ -112,12 +113,6 @@ ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ -#ifdef CONFIG_X86_64 -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(6); /* RW_ */ -#endif - init PT_LOAD FLAGS(7); /* RWE */ -#endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -193,6 +188,8 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) + CACHE_HOT_DATA(L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA @@ -216,21 +213,7 @@ SECTIONS __init_begin = .; /* paired with __init_end */ } -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - .init.text - should - * start another segment - init. - */ - PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) - ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, - "per-CPU data too large - increase CONFIG_PHYSICAL_START") -#endif - INIT_TEXT_SECTION(PAGE_SIZE) -#ifdef CONFIG_X86_64 - :init -#endif /* * Section for code used exclusively before alternatives are run. All @@ -347,9 +330,8 @@ SECTIONS EXIT_DATA } -#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU_SECTION(INTERNODE_CACHE_BYTES) -#endif + PERCPU_SECTION(L1_CACHE_BYTES) + ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large") RUNTIME_CONST_VARIABLES RUNTIME_CONST(ptr, USER_PTR_MAX) @@ -493,19 +475,6 @@ SECTIONS PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); #ifdef CONFIG_X86_64 -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(fixed_percpu_data); -INIT_PER_CPU(irq_stack_backing_store); - -#ifdef CONFIG_SMP -. = ASSERT((fixed_percpu_data == 0), - "fixed_percpu_data is not at start of per-cpu area"); -#endif #ifdef CONFIG_MITIGATION_UNRET_ENTRY . 
= ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6ebeb6cea6c0..24f0318c50d7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d7ab8780ab9e..739aa6c0d0c3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; + hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9dbc0f5d9865..28e3317124fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2914,9 +2914,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - apic->lapic_timer.timer.function = apic_timer_fn; + hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8abeab91d329..d5d0c5c3300b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,6 +607,9 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); + + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -684,6 +687,9 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + return 0; } @@ -1563,7 +1569,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) + if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && + static_branch_likely(&switch_vcpu_ibpb)) indirect_branch_prediction_barrier(); } if (kvm_vcpu_apicv_active(vcpu)) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d06e50d9c0e7..5504d9e9fd32 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5329,9 +5329,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = 
allocate_vpid(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b70ed72c1783..5c5766467a61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1477,7 +1477,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * performs IBPB on nested VM-Exit (a single nested transition * may switch the active VMCS multiple times). */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + if (static_branch_likely(&switch_vcpu_ibpb) && + (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) indirect_branch_prediction_barrier(); } diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 633c87e2fd92..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 7449be30d701..bd21e9c335ad 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2289,8 +2289,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8a59c61624c2..64ccecedc9f8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -56,7 +56,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += string_32.o lib-y += memmove_32.o lib-y += cmpxchg8b_emu.o -ifneq ($(CONFIG_X86_CMPXCHG64),y) +ifneq ($(CONFIG_X86_CX8),y) lib-y += atomic64_386_32.o endif else @@ -66,5 +66,6 @@ endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o copy_user_uncached_64.o - lib-y += cmpxchg16b_emu.o + lib-y += cmpxchg16b_emu.o + lib-y += bhi.o endif diff --git a/arch/x86/lib/bhi.S b/arch/x86/lib/bhi.S new file mode 100644 index 000000000000..58891681261b --- /dev/null +++ b/arch/x86/lib/bhi.S @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/unwind_hints.h> +#include <asm/nospec-branch.h> + +/* + * Notably, the FineIBT preamble calling these will have ZF set and r10 zero. 
+ * + * The very last element is in fact larger than 32 bytes, but since its the + * last element, this does not matter, + * + * There are 2 #UD sites, located between 0,1-2,3 and 4,5-6,7 such that they + * can be reached using Jcc.d8, these elements (1 and 5) have sufficiently + * big alignment holes for this to not stagger the array. + */ + +.pushsection .noinstr.text, "ax" + + .align 32 +SYM_CODE_START(__bhi_args) + +#ifdef CONFIG_FINEIBT_BHI + + .align 32 +SYM_INNER_LABEL(__bhi_args_0, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_1, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_1: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_2, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_3, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_1 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_4, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_5, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 8 + ANNOTATE_REACHABLE +.Lud_2: ud2 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_6, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + ANNOTATE_UNRET_SAFE + ret + int3 + + .align 32 +SYM_INNER_LABEL(__bhi_args_7, SYM_L_LOCAL) + ANNOTATE_NOENDBR + UNWIND_HINT_FUNC + jne .Lud_2 + cmovne %r10, %rdi + cmovne %r10, %rsi + cmovne %r10, %rdx + cmovne %r10, %rcx + cmovne %r10, %r8 + cmovne %r10, %r9 + cmovne %r10, %rsp + ANNOTATE_UNRET_SAFE + ret + int3 + +#endif /* CONFIG_FINEIBT_BHI */ + + .align 32 +SYM_INNER_LABEL(__bhi_args_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + nop /* Work around toolchain+objtool quirk */ +SYM_CODE_END(__bhi_args) + +.popsection diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 2760a15fbc00..a508e4a8c66a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <linux/objtool.h> #include <asm/asm.h> /* @@ -14,7 +16,7 @@ * Zero a page. 
* %rdi - page */ -SYM_FUNC_START(clear_page_rep) +SYM_TYPED_FUNC_START(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq @@ -22,7 +24,7 @@ SYM_FUNC_START(clear_page_rep) SYM_FUNC_END(clear_page_rep) EXPORT_SYMBOL_GPL(clear_page_rep) -SYM_FUNC_START(clear_page_orig) +SYM_TYPED_FUNC_START(clear_page_orig) xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -44,7 +46,7 @@ SYM_FUNC_START(clear_page_orig) SYM_FUNC_END(clear_page_orig) EXPORT_SYMBOL_GPL(clear_page_orig) -SYM_FUNC_START(clear_page_erms) +SYM_TYPED_FUNC_START(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb @@ -63,6 +65,7 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * rcx: uncleared bytes or 0 if successful. */ SYM_FUNC_START(rep_stos_alternative) + ANNOTATE_NOENDBR cmpq $64,%rcx jae .Lunrolled diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 1c96be769adc..d4bb24347ff8 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,7 @@ .text -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 /* * Emulate 'cmpxchg8b (%esi)' on UP diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index d6ae793d08fa..d8e87fedc20d 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -3,6 +3,7 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> @@ -13,7 +14,7 @@ * prefetch distance based on SMP/UP. */ ALIGN -SYM_FUNC_START(copy_page) +SYM_TYPED_FUNC_START(copy_page) ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index fc9fb5d06174..aa8c341b2441 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,6 +8,8 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> +#include <linux/objtool.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> #include <asm/asm.h> @@ -30,6 +32,7 @@ * it simpler for us, we can clobber rsi/rdi and rax freely. */ SYM_FUNC_START(rep_movs_alternative) + ANNOTATE_NOENDBR cmpq $64,%rcx jae .Llarge diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S index 2918e36eece2..18350b343c2a 100644 --- a/arch/x86/lib/copy_user_uncached_64.S +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -5,6 +5,7 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/asm.h> /* @@ -27,6 +28,7 @@ * rax uncopied bytes or 0 if successful. */ SYM_FUNC_START(__copy_user_nocache) + ANNOTATE_NOENDBR /* If destination is not 7-byte aligned, we'll have to align it */ testb $7,%dil jne .Lalign diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 23f81ca3f06b..e86eda2c0b04 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -131,7 +131,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles) * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu * variable as the monitor target. 
*/ - __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 89ecd57c9d42..9d5654b8a72a 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -28,22 +28,20 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/page_types.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/asm.h> #include <asm/smap.h> +#include <asm/runtime-const.h> #define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC .macro check_range size:req .if IS_ENABLED(CONFIG_X86_64) - movq $0x0123456789abcdef,%rdx - 1: - .pushsection runtime_ptr_USER_PTR_MAX,"a" - .long 1b - 8 - . - .popsection + RUNTIME_CONST_PTR USER_PTR_MAX, rdx cmp %rdx, %rax cmova %rdx, %rax .else @@ -62,6 +60,7 @@ .text SYM_FUNC_START(__get_user_1) + ANNOTATE_NOENDBR check_range size=1 ASM_STAC UACCESS movzbl (%_ASM_AX),%edx @@ -72,6 +71,7 @@ SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) SYM_FUNC_START(__get_user_2) + ANNOTATE_NOENDBR check_range size=2 ASM_STAC UACCESS movzwl (%_ASM_AX),%edx @@ -82,6 +82,7 @@ SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) SYM_FUNC_START(__get_user_4) + ANNOTATE_NOENDBR check_range size=4 ASM_STAC UACCESS movl (%_ASM_AX),%edx @@ -92,6 +93,7 @@ SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) SYM_FUNC_START(__get_user_8) + ANNOTATE_NOENDBR #ifndef CONFIG_X86_64 xor %ecx,%ecx #endif @@ -111,6 +113,7 @@ EXPORT_SYMBOL(__get_user_8) /* .. and the same for __get_user, just without the range checks */ SYM_FUNC_START(__get_user_nocheck_1) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movzbl (%_ASM_AX),%edx @@ -121,6 +124,7 @@ SYM_FUNC_END(__get_user_nocheck_1) EXPORT_SYMBOL(__get_user_nocheck_1) SYM_FUNC_START(__get_user_nocheck_2) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movzwl (%_ASM_AX),%edx @@ -131,6 +135,7 @@ SYM_FUNC_END(__get_user_nocheck_2) EXPORT_SYMBOL(__get_user_nocheck_2) SYM_FUNC_START(__get_user_nocheck_4) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC UACCESS movl (%_ASM_AX),%edx @@ -141,6 +146,7 @@ SYM_FUNC_END(__get_user_nocheck_4) EXPORT_SYMBOL(__get_user_nocheck_4) SYM_FUNC_START(__get_user_nocheck_8) + ANNOTATE_NOENDBR ASM_STAC ASM_BARRIER_NOSPEC #ifdef CONFIG_X86_64 diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 774bdf3e6f0a..edbeb3ecad38 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/asm.h> @@ -9,6 +10,7 @@ * %rdi: w */ SYM_FUNC_START(__sw_hweight32) + ANNOTATE_NOENDBR #ifdef CONFIG_X86_64 movl %edi, %eax # w @@ -42,6 +44,7 @@ EXPORT_SYMBOL(__sw_hweight32) */ #ifdef CONFIG_X86_64 SYM_FUNC_START(__sw_hweight64) + ANNOTATE_NOENDBR pushq %rdi pushq %rdx diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 1b60ae81ecd8..aa1f92ee6b2e 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,6 +8,7 @@ */ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> @@ -26,7 +27,7 @@ * Output: * rax: dest */ -SYM_FUNC_START(__memmove) +SYM_TYPED_FUNC_START(__memmove) mov %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 0199d56cb479..d66b710d628f 
100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -3,6 +3,7 @@ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/cpufeatures.h> #include <asm/alternative.h> @@ -28,7 +29,7 @@ * only for the return value that is the same as the source input, * which the compiler could/should do much better anyway. */ -SYM_FUNC_START(__memset) +SYM_TYPED_FUNC_START(__memset) ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index ebd259f31496..5ef8494896e8 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <linux/errno.h> +#include <linux/cfi_types.h> #include <asm/asm.h> #include <asm/msr.h> @@ -12,7 +13,7 @@ * */ .macro op_safe_regs op -SYM_FUNC_START(\op\()_safe_regs) +SYM_TYPED_FUNC_START(\op\()_safe_regs) pushq %rbx pushq %r12 movq %rdi, %r10 /* Save pointer */ diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 4bf4fad5b148..5a18ecc04a6c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -103,6 +103,7 @@ int msr_set_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, true); } +EXPORT_SYMBOL_GPL(msr_set_bit); /** * msr_clear_bit - Clear @bit in a MSR @msr. @@ -118,6 +119,7 @@ int msr_clear_bit(u32 msr, u8 bit) { return __flip_bit(msr, bit, false); } +EXPORT_SYMBOL_GPL(msr_clear_bit); #ifdef CONFIG_TRACEPOINTS void do_trace_write_msr(unsigned int msr, u64 val, int failed) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 975c9c18263d..46d9e9b98a61 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -13,6 +13,7 @@ */ #include <linux/export.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/asm.h> @@ -45,6 +46,7 @@ .text SYM_FUNC_START(__put_user_1) + ANNOTATE_NOENDBR check_range size=1 ASM_STAC 1: movb %al,(%_ASM_CX) @@ -55,6 +57,7 @@ SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) SYM_FUNC_START(__put_user_nocheck_1) + ANNOTATE_NOENDBR ASM_STAC 2: movb %al,(%_ASM_CX) xor %ecx,%ecx @@ -64,6 +67,7 @@ SYM_FUNC_END(__put_user_nocheck_1) EXPORT_SYMBOL(__put_user_nocheck_1) SYM_FUNC_START(__put_user_2) + ANNOTATE_NOENDBR check_range size=2 ASM_STAC 3: movw %ax,(%_ASM_CX) @@ -74,6 +78,7 @@ SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) SYM_FUNC_START(__put_user_nocheck_2) + ANNOTATE_NOENDBR ASM_STAC 4: movw %ax,(%_ASM_CX) xor %ecx,%ecx @@ -83,6 +88,7 @@ SYM_FUNC_END(__put_user_nocheck_2) EXPORT_SYMBOL(__put_user_nocheck_2) SYM_FUNC_START(__put_user_4) + ANNOTATE_NOENDBR check_range size=4 ASM_STAC 5: movl %eax,(%_ASM_CX) @@ -93,6 +99,7 @@ SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) SYM_FUNC_START(__put_user_nocheck_4) + ANNOTATE_NOENDBR ASM_STAC 6: movl %eax,(%_ASM_CX) xor %ecx,%ecx @@ -102,6 +109,7 @@ SYM_FUNC_END(__put_user_nocheck_4) EXPORT_SYMBOL(__put_user_nocheck_4) SYM_FUNC_START(__put_user_8) + ANNOTATE_NOENDBR check_range size=8 ASM_STAC 7: mov %_ASM_AX,(%_ASM_CX) @@ -115,6 +123,7 @@ SYM_FUNC_END(__put_user_8) EXPORT_SYMBOL(__put_user_8) SYM_FUNC_START(__put_user_nocheck_8) + ANNOTATE_NOENDBR ASM_STAC 9: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 391059b2c6fb..a26c43abd47d 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -326,6 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret) #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || 
defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) + ANNOTATE_NOENDBR ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) @@ -342,7 +343,7 @@ SYM_FUNC_START(call_depth_return_thunk) * case. */ CALL_THUNKS_DEBUG_INC_RETS - shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth) + shlq $5, PER_CPU_VAR(__x86_call_depth) jz 1f ANNOTATE_UNRET_SAFE ret diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e9251b89a9e9..654280aaa3e9 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -18,7 +18,7 @@ #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB - * @vaddr: virtual start address + * @addr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h index 60f4dcc5edc3..93cbc89b34e2 100644 --- a/arch/x86/math-emu/control_w.h +++ b/arch/x86/math-emu/control_w.h @@ -11,7 +11,7 @@ #ifndef _CONTROLW_H_ #define _CONTROLW_H_ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define _Const_(x) $##x #else #define _Const_(x) x diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h index 75230b977577..59961d350bc4 100644 --- a/arch/x86/math-emu/exception.h +++ b/arch/x86/math-emu/exception.h @@ -10,7 +10,7 @@ #ifndef _EXCEPTION_H_ #define _EXCEPTION_H_ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define Const_(x) $##x #else #define Const_(x) x @@ -37,7 +37,7 @@ #define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) #define PRECISION_LOST_DOWN Const_(EX_Precision) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef DEBUG #define EXCEPTION(x) { printk("exception in %s at line %d\n", \ @@ -46,6 +46,6 @@ #define EXCEPTION(x) FPU_exception(x) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _EXCEPTION_H_ */ diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index 0c122226ca56..def569c50b76 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -20,7 +20,7 @@ */ #define PECULIAR_486 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #include "fpu_asm.h" #define Const(x) $##x #else @@ -68,7 +68,7 @@ #define FPU_Exception Const(0x80000000) /* Added to tag returns. 
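These math-emu headers move from the kbuild-provided __ASSEMBLY__ guard to __ASSEMBLER__, which the compiler predefines on its own whenever it preprocesses assembly. A sketch of the dual-use header pattern they follow (the file and macro names are invented for illustration):

/* fpu_consts.h: sketch of the dual-use pattern, not an actual kernel header */
#ifndef _FPU_CONSTS_H_
#define _FPU_CONSTS_H_

#ifdef __ASSEMBLER__			/* predefined by the compiler for .S files */
#define _Const_(x) $##x
#else
#define _Const_(x) x
#endif

#define STATUS_EXC_MASK _Const_(0x27f)	/* "$0x27f" in .S files, "0x27f" in C */

#ifndef __ASSEMBLER__
static inline int status_exceptions(int sw) { return sw & STATUS_EXC_MASK; }
#endif

#endif /* _FPU_CONSTS_H_ */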
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include "fpu_system.h" @@ -213,6 +213,6 @@ asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, #include "fpu_proto.h" #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _FPU_EMU_H_ */ diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h index b77bafec9526..f642957330ef 100644 --- a/arch/x86/math-emu/status_w.h +++ b/arch/x86/math-emu/status_w.h @@ -13,7 +13,7 @@ #include "fpu_emu.h" /* for definition of PECULIAR_486 */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define Const__(x) $##x #else #define Const__(x) x @@ -37,7 +37,7 @@ #define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define COMP_A_gt_B 1 #define COMP_A_eq_B 2 @@ -63,6 +63,6 @@ static inline void setcc(int cc) # define clear_C1() #endif /* PECULIAR_486 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _STATUS_H_ */ diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 5ab7bd2f1983..bd5d101c5c37 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -101,9 +101,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, pmd_t *pmd; bool use_gbpage; - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; + next = pud_addr_end(addr, end); /* if this is already a gbpage, this portion is already mapped */ if (pud_leaf(*pud)) @@ -154,10 +152,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, p4d_t *p4d = p4d_page + p4d_index(addr); pud_t *pud; - next = (addr & P4D_MASK) + P4D_SIZE; - if (next > end) - next = end; - + next = p4d_addr_end(addr, end); if (p4d_present(*p4d)) { pud = pud_offset(p4d, 0); result = ident_pud_init(info, pud, addr, next); @@ -199,10 +194,7 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - + next = pgd_addr_end(addr, end); if (pgd_present(*pgd)) { p4d = p4d_offset(pgd, 0); result = ident_p4d_init(info, p4d, addr, next); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 62aa4d66a032..bfa444a7dbb0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -645,8 +645,13 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_phys_free(addr, PMD_SIZE); - real_end = addr + PMD_SIZE; + if (!addr) { + pr_warn("Failed to release memory for alloc_low_pages()"); + real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE)); + } else { + memblock_phys_free(addr, PMD_SIZE); + real_end = addr + PMD_SIZE; + } /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac41b1e0940d..f288aad8dc74 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -582,7 +582,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" + "Warning: only 4GB will be used. 
Support for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: */ @@ -606,18 +606,13 @@ static void __init highmem_pfn_init(void) #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); } -#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 01ea7c6df303..519aa53114fa 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -469,8 +469,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PAGE_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && - !e820__mapped_any(paddr & PAGE_MASK, paddr_next, E820_TYPE_ACPI)) set_pte_init(pte, __pte(0), init); continue; @@ -526,8 +524,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PMD_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && - !e820__mapped_any(paddr & PMD_MASK, paddr_next, E820_TYPE_ACPI)) set_pmd_init(pmd, __pmd(0), init); continue; @@ -615,8 +611,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & PUD_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && - !e820__mapped_any(paddr & PUD_MASK, paddr_next, E820_TYPE_ACPI)) set_pud_init(pud, __pud(0), init); continue; @@ -704,8 +698,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RAM) && !e820__mapped_any(paddr & P4D_MASK, paddr_next, - E820_TYPE_RESERVED_KERN) && - !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_ACPI)) set_p4d_init(p4d, __p4d(0), init); continue; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 38ff7791a9c7..42c90b420773 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) +{ + if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (void __force *)ioremap_cache(phys_addr, size); + + return (void __force *)ioremap_encrypted(phys_addr, size); +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9dddf19a5571..0539efd0d216 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt /* cpu_feature_enabled() cannot be used this early */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 11a93542d198..3c306de52fd4 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -113,8 
+113,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. + */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index b56c5c073003..7490ff6d83b1 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -7,8 +7,6 @@ * Author: Tom Lendacky <thomas.lendacky@amd.com> */ -#define DISABLE_BRANCH_PROFILING - #include <linux/linkage.h> #include <linux/init.h> #include <linux/mm.h> diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index e25288ee33c2..f8a33b25ae86 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -72,6 +72,7 @@ SYM_FUNC_START(sme_encrypt_execute) SYM_FUNC_END(sme_encrypt_execute) SYM_FUNC_START(__enc_copy) + ANNOTATE_NOENDBR /* * Routine used to encrypt memory in place. * This routine must be run outside of the kernel proper since diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e6c7686f443a..5eecdd92da10 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -7,8 +7,6 @@ * Author: Tom Lendacky <thomas.lendacky@amd.com> */ -#define DISABLE_BRANCH_PROFILING - /* * Since we're dealing with identity mappings, physical and virtual * addresses are the same, so override these defines which are ultimately @@ -565,7 +563,7 @@ void __head sme_enable(struct boot_params *bp) } RIP_REL_REF(sme_me_mask) = me_mask; - physical_mask &= ~me_mask; - cc_vendor = CC_VENDOR_AMD; + RIP_REL_REF(physical_mask) &= ~me_mask; + RIP_REL_REF(cc_vendor) = CC_VENDOR_AMD; cc_set_mask(me_mask); } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index b8a6ffffb451..5ed2109211da 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. 
*/ - gap_min = SIZE_128M; - gap_max = (task_size / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SIZE_128M, (task_size / 6) * 5); return PAGE_ALIGN(task_size - gap - rnd); } diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index 3d2f7f0a6ed1..ad3c1feec990 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -183,7 +183,7 @@ static int pageattr_test(void) break; case 1: - err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1); break; case 2: diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index feb8cc6a12bf..e40861c9cb90 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -43,6 +43,7 @@ #include <linux/fs.h> #include <linux/rbtree.h> +#include <asm/cpu_device_id.h> #include <asm/cacheflush.h> #include <asm/cacheinfo.h> #include <asm/processor.h> @@ -290,9 +291,8 @@ void __init pat_bp_init(void) return; } - if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vfm >= INTEL_PENTIUM_PRO && c->x86_vfm <= INTEL_PENTIUM_M_DOTHAN) || + (c->x86_vfm >= INTEL_P4_WILLAMETTE && c->x86_vfm <= INTEL_P4_CEDARMILL)) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ef4514d64c05..72405d315b41 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -73,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +#define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { @@ -105,6 +106,18 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } +static void collapse_page_count(int level) +{ + direct_pages_count[level]++; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); + } + direct_pages_count[level - 1] -= PTRS_PER_PTE; +} + void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", @@ -122,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m) } #else static inline void split_page_count(int level) { } +static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS @@ -211,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +#ifdef CONFIG_X86_64 + static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } -#ifdef CONFIG_X86_64 - /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): @@ -394,16 +408,49 @@ static void __cpa_flush_tlb(void *data) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } -static void cpa_flush(struct cpa_data *data, int cache) +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); + +static void cpa_collapse_large_pages(struct cpa_data *cpa) +{ + unsigned long start, addr, end; + struct ptdesc *ptdesc, *tmp; + LIST_HEAD(pgtables); + int collapsed = 0; + int i; + + if (cpa->flags 
& (CPA_PAGES_ARRAY | CPA_ARRAY)) { + for (i = 0; i < cpa->numpages; i++) + collapsed += collapse_large_pages(__cpa_addr(cpa, i), + &pgtables); + } else { + addr = __cpa_addr(cpa, 0); + start = addr & PMD_MASK; + end = addr + PAGE_SIZE * cpa->numpages; + + for (addr = start; within(addr, start, end); addr += PMD_SIZE) + collapsed += collapse_large_pages(addr, &pgtables); + } + + if (!collapsed) + return; + + flush_tlb_all(); + + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } +} + +static void cpa_flush(struct cpa_data *cpa, int cache) { - struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + goto collapse_large_pages; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) @@ -412,7 +459,7 @@ static void cpa_flush(struct cpa_data *data, int cache) on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) - return; + goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { @@ -428,6 +475,10 @@ static void cpa_flush(struct cpa_data *data, int cache) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); + +collapse_large_pages: + if (cpa->flags & CPA_COLLAPSE) + cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, @@ -1197,6 +1248,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } +static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, + struct list_head *pgtables) +{ + pmd_t _pmd, old_pmd; + pte_t *pte, first; + unsigned long pfn; + pgprot_t pgprot; + int i = 0; + + addr &= PMD_MASK; + pte = pte_offset_kernel(pmd, addr); + first = *pte; + pfn = pte_pfn(first); + + /* Make sure alignment is suitable */ + if (PFN_PHYS(pfn) & ~PMD_MASK) + return 0; + + /* The page is 4k intentionally */ + if (pte_flags(first) & _PAGE_KERNEL_4K) + return 0; + + /* Check that the rest of PTEs are compatible with the first one */ + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { + pte_t entry = *pte; + + if (!pte_present(entry)) + return 0; + if (pte_flags(entry) != pte_flags(first)) + return 0; + if (pte_pfn(entry) != pte_pfn(first) + i) + return 0; + } + + old_pmd = *pmd; + + /* Success: set up a large page */ + pgprot = pgprot_4k_2_large(pte_pgprot(first)); + pgprot_val(pgprot) |= _PAGE_PSE; + _pmd = pfn_pmd(pfn, pgprot); + set_pmd(pmd, _pmd); + + /* Queue the page table to be freed after TLB flush */ + list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); + + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + struct page *page; + + /* Update all PGD tables to use the same large page */ + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + /* Something is wrong if entries doesn't match */ + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) + continue; + set_pmd(pmd, _pmd); + } + } + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_2M); + + return 1; +} + +static int collapse_pud_page(pud_t *pud, unsigned long addr, + struct list_head *pgtables) +{ + unsigned long pfn; + pmd_t *pmd, first; + int i; + + if (!direct_gbpages) + return 0; + + addr &= PUD_MASK; + pmd = pmd_offset(pud, addr); + first = *pmd; + + /* + * To restore PUD page all PMD entries 
must be large and + * have suitable alignment + */ + pfn = pmd_pfn(first); + if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) + return 0; + + /* + * To restore PUD page, all following PMDs must be compatible with the + * first one. + */ + for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { + pmd_t entry = *pmd; + + if (!pmd_present(entry) || !pmd_leaf(entry)) + return 0; + if (pmd_flags(entry) != pmd_flags(first)) + return 0; + if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) + return 0; + } + + /* Restore PUD page and queue page table to be freed after TLB flush */ + list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); + set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_1G); + + return 1; +} + +/* + * Collapse PMD and PUD pages in the kernel mapping around the address where + * possible. + * + * Caller must flush TLB and free page tables queued on the list before + * touching the new entries. CPU must not see TLB entries of different size + * with different attributes. + */ +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) +{ + int collapsed = 0; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + addr &= PMD_MASK; + + spin_lock(&pgd_lock); + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + goto out; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + goto out; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud) || pud_leaf(*pud)) + goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) + goto out; + + collapsed = collapse_pmd_page(pmd, addr, pgtables); + if (collapsed) + collapsed += collapse_pud_page(pud, addr, pgtables); + +out: + spin_unlock(&pgd_lock); + return collapsed; +} + static bool try_to_free_pte_page(pte_t *pte) { int i; @@ -1942,19 +2148,6 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, CPA_PAGES_ARRAY, pages); } -/* - * __set_memory_prot is an internal helper for callers that have been passed - * a pgprot_t value from upper layers and a reservation has already been taken. - * If you want to set the pgprot to a specific page protocol, use the - * set_memory_xx() functions. 
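The heart of collapse_pmd_page() above is the walk that proves a 2M-aligned run of 512 PTEs is present, flag-identical and physically contiguous before the PMD is rewritten as a leaf. A simplified stand-alone sketch of that test, assuming a flat 64-bit entry layout purely for illustration (real x86 PTEs also keep flags in high bits):

#include <stdbool.h>
#include <stdint.h>

#define PTRS_PER_PTE	512
#define PTE_PFN_SHIFT	12
#define PTE_FLAGS_MASK	((1ULL << PTE_PFN_SHIFT) - 1)	/* flags live in the low bits here */

/* Can this run of 512 entries fold back into one large mapping? The first
 * pfn must be 2M aligned, and every entry must be present, carry the same
 * flags and map the next consecutive pfn. */
static bool pte_run_collapsible(const uint64_t *pte)
{
	uint64_t pfn = pte[0] >> PTE_PFN_SHIFT;
	uint64_t flags = pte[0] & PTE_FLAGS_MASK;

	if (pfn & (PTRS_PER_PTE - 1))		/* first pfn not 2M aligned */
		return false;

	for (int i = 1; i < PTRS_PER_PTE; i++) {
		if (!(pte[i] & 0x1))		/* not present */
			return false;
		if ((pte[i] & PTE_FLAGS_MASK) != flags)
			return false;
		if ((pte[i] >> PTE_PFN_SHIFT) != pfn + i)
			return false;
	}
	return true;
}

int main(void)
{
	static uint64_t pte[PTRS_PER_PTE];

	for (int i = 0; i < PTRS_PER_PTE; i++)	/* contiguous run, identical flags */
		pte[i] = ((uint64_t)(0x200 + i) << PTE_PFN_SHIFT) | 0x63;
	return pte_run_collapsible(pte) ? 0 : 1;
}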
- */ -int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) -{ - return change_page_attr_set_clr(&addr, numpages, prot, - __pgprot(~pgprot_val(prot)), 0, 0, - NULL); -} - int _set_memory_uc(unsigned long addr, int numpages) { /* @@ -2120,7 +2313,8 @@ int set_memory_rox(unsigned long addr, int numpages) if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; - return change_page_attr_clear(&addr, numpages, clr, 0); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, + CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) @@ -2147,7 +2341,8 @@ int set_memory_p(unsigned long addr, int numpages) int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + return change_page_attr_set_clr(&addr, numpages, + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } @@ -2420,7 +2615,7 @@ static int __set_pages_np(struct page *page, int numpages) .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* @@ -2507,7 +2702,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; @@ -2550,7 +2745,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1fef5ad32d5a..cec321fb74f2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -12,59 +12,15 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#ifdef CONFIG_HIGHPTE -#define PGTABLE_HIGHMEM __GFP_HIGHMEM -#else -#define PGTABLE_HIGHMEM 0 -#endif - -#ifndef CONFIG_PARAVIRT -#ifndef CONFIG_PT_RECLAIM -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif /* !CONFIG_PT_RECLAIM */ -#endif /* !CONFIG_PARAVIRT */ - -gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; - pgtable_t pte_alloc_one(struct mm_struct *mm) { - return __pte_alloc_one(mm, __userpte_alloc_gfp); -} - -static int __init setup_userpte(char *arg) -{ - if (!arg) - return -EINVAL; - - /* - * "userpte=nohigh" disables allocation of user pagetables in - * high memory. 
- */ - if (strcmp(arg, "nohigh") == 0) - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - else - return -EINVAL; - return 0; + return __pte_alloc_one(mm, GFP_PGTABLE_USER); } -early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -78,21 +34,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); + tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6cf881a942bb..e459d97ef397 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -74,13 +74,15 @@ * use different names for each of them: * * ASID - [0, TLB_NR_DYN_ASIDS-1] - * the canonical identifier for an mm + * the canonical identifier for an mm, dynamically allocated on each CPU + * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] + * the canonical, global identifier for an mm, identical across all CPUs * - * kPCID - [1, TLB_NR_DYN_ASIDS] + * kPCID - [1, MAX_ASID_AVAILABLE] * the value we write into the PCID part of CR3; corresponds to the * ASID+1, because PCID 0 is special. * - * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] * for KPTI each mm has two address spaces and thus needs two * PCID values, but we can still do with a single ASID denomination * for each mm. Corresponds to kPCID + 2048. @@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, return; } + /* + * TLB consistency for global ASIDs is maintained with hardware assisted + * remote TLB flushing. Global ASIDs are always up to date. + */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + u16 global_asid = mm_global_asid(next); + + if (global_asid) { + *new_asid = global_asid; + *need_flush = false; + return; + } + } + if (this_cpu_read(cpu_tlbstate.invalidate_other)) clear_asid_other(); @@ -252,6 +268,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, } /* + * Global ASIDs are allocated for multi-threaded processes that are + * active on multiple CPUs simultaneously, giving each of those + * processes the same PCID on every CPU, for use with hardware-assisted + * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. + * + * These global ASIDs are held for the lifetime of the process. 
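The ASID, kPCID and uPCID ranges described above reduce to two small mappings: the PCID programmed into CR3 is the ASID plus one, because PCID 0 is reserved, and the KPTI user counterpart sits 2048 entries higher. A toy illustration (the bit position is an assumption chosen to match the 2048 offset, not a quote of the kernel's constants):

#include <assert.h>

#define PTI_USER_PCID_BIT	11	/* 1 << 11 == 2048; assumption for this sketch */

/* kPCID is the ASID plus one because PCID 0 is special; the KPTI user
 * counterpart is the same value with the 2048 bit set on top. */
static unsigned short kern_pcid(unsigned short asid)
{
	return asid + 1;
}

static unsigned short user_pcid(unsigned short asid)
{
	return kern_pcid(asid) | (1u << PTI_USER_PCID_BIT);
}

int main(void)
{
	assert(kern_pcid(0) == 1);
	assert(user_pcid(0) == 2048 + 1);
	return 0;
}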
+ */ +static DEFINE_RAW_SPINLOCK(global_asid_lock); +static u16 last_global_asid = MAX_ASID_AVAILABLE; +static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); +static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); +static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; + +/* + * When the search for a free ASID in the global ASID space reaches + * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously + * freed global ASIDs are safe to re-use. + * + * This way the global flush only needs to happen at ASID rollover + * time, and not at ASID allocation time. + */ +static void reset_global_asid_space(void) +{ + lockdep_assert_held(&global_asid_lock); + + invlpgb_flush_all_nonglobals(); + + /* + * The TLB flush above makes it safe to re-use the previously + * freed global ASIDs. + */ + bitmap_andnot(global_asid_used, global_asid_used, + global_asid_freed, MAX_ASID_AVAILABLE); + bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); + + /* Restart the search from the start of global ASID space. */ + last_global_asid = TLB_NR_DYN_ASIDS; +} + +static u16 allocate_global_asid(void) +{ + u16 asid; + + lockdep_assert_held(&global_asid_lock); + + /* The previous allocation hit the edge of available address space */ + if (last_global_asid >= MAX_ASID_AVAILABLE - 1) + reset_global_asid_space(); + + asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); + + if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { + /* This should never happen. */ + VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", + global_asid_available); + return 0; + } + + /* Claim this global ASID. */ + __set_bit(asid, global_asid_used); + last_global_asid = asid; + global_asid_available--; + return asid; +} + +/* + * Check whether a process is currently active on more than @threshold CPUs. + * This is a cheap estimation on whether or not it may make sense to assign + * a global ASID to this process, and use broadcast TLB invalidation. + */ +static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) +{ + int count = 0; + int cpu; + + /* This quick check should eliminate most single threaded programs. */ + if (cpumask_weight(mm_cpumask(mm)) <= threshold) + return false; + + /* Slower check to make sure. */ + for_each_cpu(cpu, mm_cpumask(mm)) { + /* Skip the CPUs that aren't really running this process. */ + if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) + continue; + + if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) + continue; + + if (++count > threshold) + return true; + } + return false; +} + +/* + * Assign a global ASID to the current process, protecting against + * races between multiple threads in the process. + */ +static void use_global_asid(struct mm_struct *mm) +{ + u16 asid; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* This process is already using broadcast TLB invalidation. */ + if (mm_global_asid(mm)) + return; + + /* + * The last global ASID was consumed while waiting for the lock. + * + * If this fires, a more aggressive ASID reuse scheme might be + * needed. 
+ */ + if (!global_asid_available) { + VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); + return; + } + + asid = allocate_global_asid(); + if (!asid) + return; + + mm_assign_global_asid(mm, asid); +} + +void mm_free_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + if (!mm_global_asid(mm)) + return; + + guard(raw_spinlock_irqsave)(&global_asid_lock); + + /* The global ASID can be re-used only after flush at wrap-around. */ +#ifdef CONFIG_BROADCAST_TLB_FLUSH + __set_bit(mm->context.global_asid, global_asid_freed); + + mm->context.global_asid = 0; + global_asid_available++; +#endif +} + +/* + * Is the mm transitioning from a CPU-local ASID to a global ASID? + */ +static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) +{ + u16 global_asid = mm_global_asid(mm); + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + /* Process is transitioning to a global ASID */ + if (global_asid && asid != global_asid) + return true; + + return false; +} + +/* + * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 + * systems have over 8k CPUs. Because of this potential ASID shortage, + * global ASIDs are handed out to processes that have frequent TLB + * flushes and are active on 4 or more CPUs simultaneously. + */ +static void consider_global_asid(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return; + + /* Check every once in a while. */ + if ((current->pid & 0x1f) != (jiffies & 0x1f)) + return; + + /* + * Assign a global ASID if the process is active on + * 4 or more CPUs simultaneously. + */ + if (mm_active_cpus_exceeds(mm, 3)) + use_global_asid(mm); +} + +static void finish_asid_transition(struct flush_tlb_info *info) +{ + struct mm_struct *mm = info->mm; + int bc_asid = mm_global_asid(mm); + int cpu; + + if (!mm_in_asid_transition(mm)) + return; + + for_each_cpu(cpu, mm_cpumask(mm)) { + /* + * The remote CPU is context switching. Wait for that to + * finish, to catch the unlikely case of it switching to + * the target mm with an out of date ASID. + */ + while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) + cpu_relax(); + + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) + continue; + + /* + * If at least one CPU is not using the global ASID yet, + * send a TLB flush IPI. The IPI should cause stragglers + * to transition soon. + * + * This can race with the CPU switching to another task; + * that results in a (harmless) extra IPI. + */ + if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { + flush_tlb_multi(mm_cpumask(info->mm), info); + return; + } + } + + /* All the CPUs running this process are using the global ASID. */ + mm_clear_asid_transition(mm); +} + +static void broadcast_tlb_flush(struct flush_tlb_info *info) +{ + bool pmd = info->stride_shift == PMD_SHIFT; + unsigned long asid = mm_global_asid(info->mm); + unsigned long addr = info->start; + + /* + * TLB flushes with INVLPGB are kicked off asynchronously. + * The inc_mm_tlb_gen() guarantees page table updates are done + * before these TLB flushes happen. + */ + if (info->end == TLB_FLUSH_ALL) { + invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); + /* Do any CPUs supporting INVLPGB need PTI? 
*/ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_single_pcid_nosync(user_pcid(asid)); + } else do { + unsigned long nr = 1; + + if (info->stride_shift <= PMD_SHIFT) { + nr = (info->end - addr) >> info->stride_shift; + nr = clamp_val(nr, 1, invlpgb_count_max); + } + + invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); + if (cpu_feature_enabled(X86_FEATURE_PTI)) + invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); + + addr += nr << info->stride_shift; + } while (addr < info->end); + + finish_asid_transition(info); + + /* Wait for the INVLPGBs kicked off above to finish. */ + __tlbsync(); +} + +/* * Given an ASID, flush the corresponding user ASID. We can delay this * until the next time we switch to it. * @@ -447,8 +725,7 @@ static void cond_mitigation(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != - (unsigned long)next->mm) + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm) indirect_branch_prediction_barrier(); } @@ -556,7 +833,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, */ if (prev == next) { /* Not actually switching mm's */ - VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + VM_WARN_ON(is_dyn_asid(prev_asid) && + this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != next->context.ctx_id); /* @@ -573,6 +851,20 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); + /* Check if the current mm is transitioning to a global ASID */ + if (mm_needs_global_asid(next, prev_asid)) { + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + goto reload_tlb; + } + + /* + * Broadcast TLB invalidation keeps this ASID up to date + * all the time. + */ + if (is_global_asid(prev_asid)) + return; + /* * If the CPU is not in lazy TLB mode, we are just switching * from one thread in a process to another thread in the same @@ -607,6 +899,13 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* + * Let nmi_uaccess_okay() and finish_asid_transition() + * know that CR3 is changing. + */ + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); + barrier(); + + /* * Leave this CPU in prev's mm_cpumask. Atomic writes to * mm_cpumask can be expensive under contention. The CPU * will be removed lazily at TLB flush time. @@ -620,14 +919,12 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - - /* Let nmi_uaccess_okay() know that we're changing CR3. 
*/ - this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); - barrier(); } +reload_tlb: new_lam = mm_lam_cr3_mask(next); if (need_flush) { + VM_WARN_ON_ONCE(is_global_asid(new_asid)); this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, new_lam, true); @@ -746,7 +1043,7 @@ static void flush_tlb_func(void *info) const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + u64 local_tlb_gen; bool local = smp_processor_id() == f->initiating_cpu; unsigned long nr_invalidate = 0; u64 mm_tlb_gen; @@ -769,6 +1066,16 @@ static void flush_tlb_func(void *info) if (unlikely(loaded_mm == &init_mm)) return; + /* Reload the ASID if transitioning into or out of a global ASID */ + if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { + switch_mm_irqs_off(NULL, loaded_mm, NULL); + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + } + + /* Broadcast ASIDs are always kept up to date with INVLPGB. */ + if (is_global_asid(loaded_mm_asid)) + return; + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); @@ -786,6 +1093,8 @@ static void flush_tlb_func(void *info) return; } + local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && f->new_tlb_gen <= local_tlb_gen)) { /* @@ -953,7 +1262,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, * up on the new contents of what used to be page tables, while * doing a speculative memory access. */ - if (info->freed_tables) + if (info->freed_tables || mm_in_asid_transition(info->mm)) on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, @@ -1000,6 +1309,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); #endif + /* + * If the number of flushes is so large that a full flush + * would be faster, do a full flush. + */ + if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; + } + info->start = start; info->end = end; info->mm = mm; @@ -1026,17 +1344,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, bool freed_tables) { struct flush_tlb_info *info; + int cpu = get_cpu(); u64 new_tlb_gen; - int cpu; - - cpu = get_cpu(); - - /* Should we flush just the requested range? */ - if ((end == TLB_FLUSH_ALL) || - ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { - start = 0; - end = TLB_FLUSH_ALL; - } /* This is also a barrier that synchronizes with switch_mm(). */ new_tlb_gen = inc_mm_tlb_gen(mm); @@ -1049,9 +1358,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case. 
*/ - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + if (mm_global_asid(mm)) { + broadcast_tlb_flush(info); + } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { info->trim_cpumask = should_trim_cpumask(mm); flush_tlb_multi(mm_cpumask(mm), info); + consider_global_asid(mm); } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { lockdep_assert_irqs_enabled(); local_irq_disable(); @@ -1064,7 +1376,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); @@ -1074,7 +1385,32 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - on_each_cpu(do_flush_tlb_all, NULL, 1); + + /* First try (faster) hardware-assisted TLB invalidation. */ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else + /* Fall back to the IPI-based invalidation. */ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +/* Flush an arbitrarily large range of memory with INVLPGB. */ +static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) +{ + unsigned long addr, nr; + + for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { + nr = (info->end - addr) >> PAGE_SHIFT; + + /* + * INVLPGB has a limit on the size of ranges it can + * flush. Break up large flushes. + */ + nr = clamp_val(nr, 1, invlpgb_count_max); + + invlpgb_flush_addr_nosync(addr, nr); + } + __tlbsync(); } static void do_kernel_range_flush(void *info) @@ -1087,24 +1423,37 @@ static void do_kernel_range_flush(void *info) flush_tlb_one_kernel(addr); } -void flush_tlb_kernel_range(unsigned long start, unsigned long end) +static void kernel_tlb_flush_all(struct flush_tlb_info *info) { - /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_flush_all(); + else on_each_cpu(do_flush_tlb_all, NULL, 1); - } else { - struct flush_tlb_info *info; - - preempt_disable(); - info = get_flush_tlb_info(NULL, start, end, 0, false, - TLB_GENERATION_INVALID); +} +static void kernel_tlb_flush_range(struct flush_tlb_info *info) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) + invlpgb_kernel_range_flush(info); + else on_each_cpu(do_kernel_range_flush, info, 1); +} - put_flush_tlb_info(); - preempt_enable(); - } +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + struct flush_tlb_info *info; + + guard(preempt)(); + + info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, + TLB_GENERATION_INVALID); + + if (info->end == TLB_FLUSH_ALL) + kernel_tlb_flush_all(info); + else + kernel_tlb_flush_range(info); + + put_flush_tlb_info(); } /* @@ -1283,7 +1632,10 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * a local TLB flush is needed. Optimize this use-case by calling * flush_tlb_func_local() directly in this case. 
*/ - if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) { + invlpgb_flush_all_nonglobals(); + batch->unmapped_pages = false; + } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { flush_tlb_multi(&batch->cpumask, info); } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { lockdep_assert_irqs_enabled(); @@ -1325,7 +1677,7 @@ bool nmi_uaccess_okay(void) if (loaded_mm != current_mm) return false; - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d..72776dcb75aa 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -410,16 +410,20 @@ static void emit_nops(u8 **pprog, int len) * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT * in arch/x86/kernel/alternative.c */ +static int emit_call(u8 **prog, void *func, void *ip); -static void emit_fineibt(u8 **pprog, u32 hash) +static void emit_fineibt(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; EMIT_ENDBR(); EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */ - EMIT2(0x74, 0x07); /* jz.d8 +7 */ - EMIT2(0x0f, 0x0b); /* ud2 */ - EMIT1(0x90); /* nop */ + if (cfi_bhi) { + emit_call(&prog, __bhi_args[arity], ip + 11); + } else { + EMIT2(0x75, 0xf9); /* jne.d8 .-7 */ + EMIT3(0x0f, 0x1f, 0x00); /* nop3 */ + } EMIT_ENDBR_POISON(); *pprog = prog; @@ -448,13 +452,13 @@ static void emit_kcfi(u8 **pprog, u32 hash) *pprog = prog; } -static void emit_cfi(u8 **pprog, u32 hash) +static void emit_cfi(u8 **pprog, u8 *ip, u32 hash, int arity) { u8 *prog = *pprog; switch (cfi_mode) { case CFI_FINEIBT: - emit_fineibt(&prog, hash); + emit_fineibt(&prog, ip, hash, arity); break; case CFI_KCFI: @@ -505,13 +509,17 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, +static void emit_prologue(u8 **pprog, u8 *ip, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb) { u8 *prog = *pprog; - emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash); + if (is_subprog) { + emit_cfi(&prog, ip, cfi_bpf_subprog_hash, 5); + } else { + emit_cfi(&prog, ip, cfi_bpf_hash, 1); + } /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ @@ -641,7 +649,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * See emit_prologue(), for IBT builds the trampoline hook is preceded * with an ENDBR instruction. */ - if (is_endbr(*(u32 *)ip)) + if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; return __bpf_arch_text_poke(ip, t, old_addr, new_addr); @@ -1480,7 +1488,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image detect_reg_usage(insn, insn_cnt, callee_regs_used); - emit_prologue(&prog, stack_depth, + emit_prologue(&prog, image, stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); /* Exception callback will clobber callee regs for its own use, and @@ -3036,7 +3044,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* skip patched call instruction and point orig_call to actual * body of the kernel function. 
*/ - if (is_endbr(*(u32 *)orig_call)) + if (is_endbr(orig_call)) orig_call += ENDBR_INSN_SIZE; orig_call += X86_PATCH_SIZE; } @@ -3047,7 +3055,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* * Indirect call for bpf_struct_ops */ - emit_cfi(&prog, cfi_get_func_hash(func_addr)); + emit_cfi(&prog, image, + cfi_get_func_hash(func_addr), + cfi_get_func_arity(func_addr)); } else { /* * Direct-call fentry stub, as such it needs accounting for the diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 48bcada5cabe..4933fb337983 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -12,8 +12,6 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_STA2X11) += sta2x11-fixup.o - obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c deleted file mode 100644 index 8c8ddc4dcc08..000000000000 --- a/arch/x86/pci/sta2x11-fixup.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping - * - * ST Microelectronics ConneXt (STA2X11/STA2X10) - * - * Copyright (c) 2010-2011 Wind River Systems, Inc. - */ - -#include <linux/pci.h> -#include <linux/pci_ids.h> -#include <linux/export.h> -#include <linux/list.h> -#include <linux/dma-map-ops.h> -#include <linux/swiotlb.h> -#include <asm/iommu.h> -#include <asm/sta2x11.h> - -#define STA2X11_SWIOTLB_SIZE (4*1024*1024) - -/* - * We build a list of bus numbers that are under the ConneXt. The - * main bridge hosts 4 busses, which are the 4 endpoints, in order. - */ -#define STA2X11_NR_EP 4 /* 0..3 included */ -#define STA2X11_NR_FUNCS 8 /* 0..7 included */ -#define STA2X11_AMBA_SIZE (512 << 20) - -struct sta2x11_ahb_regs { /* saved during suspend */ - u32 base, pexlbase, pexhbase, crw; -}; - -struct sta2x11_mapping { - int is_suspended; - struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS]; -}; - -struct sta2x11_instance { - struct list_head list; - int bus0; - struct sta2x11_mapping map[STA2X11_NR_EP]; -}; - -static LIST_HEAD(sta2x11_instance_list); - -/* At probe time, record new instances of this bridge (likely one only) */ -static void sta2x11_new_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = kzalloc(sizeof(*instance), GFP_ATOMIC); - if (!instance) - return; - /* This has a subordinate bridge, with 4 more-subordinate ones */ - instance->bus0 = pdev->subordinate->number + 1; - - if (list_empty(&sta2x11_instance_list)) { - int size = STA2X11_SWIOTLB_SIZE; - /* First instance: register your own swiotlb area */ - dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA, NULL)) - dev_emerg(&pdev->dev, "init swiotlb failed\n"); - } - list_add(&instance->list, &sta2x11_instance_list); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance); - -/* - * Utility functions used in this file from below - */ -static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - list_for_each_entry(instance, &sta2x11_instance_list, list) { - ep = pdev->bus->number - instance->bus0; - if (ep >= 0 && ep < STA2X11_NR_EP) - return instance; - } - return NULL; -} - -static int sta2x11_pdev_to_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = sta2x11_pdev_to_instance(pdev); - if 
(!instance) - return -1; - - return pdev->bus->number - instance->bus0; -} - -/* This is exported, as some devices need to access the MFD registers */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev) -{ - return sta2x11_pdev_to_instance(pdev); -} -EXPORT_SYMBOL(sta2x11_get_instance); - -/* At setup time, we use our own ops if the device is a ConneXt one */ -static void sta2x11_setup_pdev(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - - if (!instance) /* either a sta2x11 bridge or another ST device */ - return; - - /* We must enable all devices as master, for audio DMA to work */ - pci_set_master(pdev); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev); - -/* - * At boot we must set up the mappings for the pcie-to-amba bridge. - * It involves device access, and the same happens at suspend/resume time - */ - -#define AHB_MAPB 0xCA4 -#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10) -#define AHB_CRW_SZMASK 0xfffffc00UL -#define AHB_CRW_ENABLE (1 << 0) -#define AHB_CRW_WTYPE_MEM (2 << 1) -#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */ -#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */ -#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10) -#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10) -#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10) - -/* At probe time, enable mapping for each endpoint, using the pdev */ -static void sta2x11_map_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - struct device *dev = &pdev->dev; - u32 amba_base, max_amba_addr; - int i, ret; - - if (!instance) - return; - - pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); - max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - - ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); - if (ret) - dev_err(dev, "sta2x11: could not set DMA offset\n"); - - dev->bus_dma_limit = max_amba_addr; - dma_set_mask_and_coherent(&pdev->dev, max_amba_addr); - - /* Configure AHB mapping */ - pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); - pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0); - pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE | - AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE); - - /* Disable all the other windows */ - for (i = 1; i < STA2X11_NR_FUNCS; i++) - pci_write_config_dword(pdev, AHB_CRW(i), 0); - - dev_info(&pdev->dev, - "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n", - sta2x11_pdev_to_ep(pdev), amba_base, max_amba_addr); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep); - -#ifdef CONFIG_PM /* Some register values must be saved and restored */ - -static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return NULL; - ep = sta2x11_pdev_to_ep(pdev); - return instance->map + ep; -} - -static void suspend_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - if (map->is_suspended) - return; - map->is_suspended = 1; - - /* Save all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_read_config_dword(pdev, AHB_BASE(i), ®s->base); - pci_read_config_dword(pdev, AHB_PEXLBASE(i), ®s->pexlbase); - pci_read_config_dword(pdev, AHB_PEXHBASE(i), ®s->pexhbase); - pci_read_config_dword(pdev, AHB_CRW(i), ®s->crw); - } -} 
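
The removed suspend_mapping() above captures each AHB window's four config-space registers into a shadow copy, and the resume_mapping() counterpart just below writes them back. A minimal standalone sketch of that save/restore pattern, with cfg_read()/cfg_write() and the fake_cfg[] array as illustrative stand-ins for pci_read_config_dword()/pci_write_config_dword() (the helper names are assumptions for the example, not kernel APIs):

/*
 * Standalone sketch of the save/restore pattern used by the sta2x11
 * suspend/resume fixups: four 32-bit registers per AHB window are read
 * into a shadow copy at suspend time and written back at resume time.
 * cfg_read()/cfg_write() and fake_cfg[] are stand-ins for the PCI
 * config-space accessors; only the register layout follows the
 * removed file.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_FUNCS 8	/* matches STA2X11_NR_FUNCS in the removed file */

struct ahb_regs {
	uint32_t base, pexlbase, pexhbase, crw;
};

static uint32_t fake_cfg[NR_FUNCS][4];	/* pretend PCI config space */

static uint32_t cfg_read(int fn, int reg)
{
	return fake_cfg[fn][reg];
}

static void cfg_write(int fn, int reg, uint32_t val)
{
	fake_cfg[fn][reg] = val;
}

/* Capture all window registers, as suspend_mapping() does. */
static void suspend_windows(struct ahb_regs *saved)
{
	for (int i = 0; i < NR_FUNCS; i++) {
		saved[i].base     = cfg_read(i, 0);
		saved[i].pexlbase = cfg_read(i, 1);
		saved[i].pexhbase = cfg_read(i, 2);
		saved[i].crw      = cfg_read(i, 3);
	}
}

/* Write them back, as resume_mapping() does. */
static void resume_windows(const struct ahb_regs *saved)
{
	for (int i = 0; i < NR_FUNCS; i++) {
		cfg_write(i, 0, saved[i].base);
		cfg_write(i, 1, saved[i].pexlbase);
		cfg_write(i, 2, saved[i].pexhbase);
		cfg_write(i, 3, saved[i].crw);
	}
}

int main(void)
{
	struct ahb_regs saved[NR_FUNCS];

	fake_cfg[0][3] = 0x1;		/* pretend window 0 is enabled */
	suspend_windows(saved);
	fake_cfg[0][3] = 0;		/* register contents lost across suspend */
	resume_windows(saved);
	printf("window 0 crw restored to %#x\n", fake_cfg[0][3]);
	return 0;
}
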
-DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping); - -static void resume_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - - if (!map->is_suspended) - goto out; - map->is_suspended = 0; - - /* Restore all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_write_config_dword(pdev, AHB_BASE(i), regs->base); - pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase); - pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase); - pci_write_config_dword(pdev, AHB_CRW(i), regs->crw); - } -out: - pci_set_master(pdev); /* Like at boot, enable master on all devices */ -} -DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping); - -#endif /* CONFIG_PM */ diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index ccb23c73cbe8..63066e7c8517 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -14,7 +14,6 @@ #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/pm.h> -#include <linux/pm_wakeup.h> #include <linux/power_supply.h> #include <linux/suspend.h> #include <linux/workqueue.h> diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index cf5dca2dbb91..e108ce7dad6a 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -215,13 +215,12 @@ static u32 __init olpc_dt_get_board_revision(void) static int __init olpc_dt_compatible_match(phandle node, const char *compat) { char buf[64], *p; - int plen, len; + int plen; plen = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf)); if (plen <= 0) return 0; - len = strlen(compat); for (p = buf; p < buf + plen; p += strlen(p) + 1) { if (strcmp(p, compat) == 0) return 1; diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 4733a5f467b8..cfa18ec7d55f 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -173,10 +173,14 @@ SYM_CODE_START(pvh_start_xen) 1: UNWIND_HINT_END_OF_STACK - /* Set base address in stack canary descriptor. */ - mov $MSR_GS_BASE,%ecx - leal canary(%rip), %eax - xor %edx, %edx + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. 
+ */ + movl $MSR_GS_BASE,%ecx + xorl %eax, %eax + xorl %edx, %edx wrmsr /* Call xen_prepare_pvh() via the kernel virtual mapping */ @@ -238,8 +242,6 @@ SYM_DATA_START_LOCAL(gdt_start) SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 -SYM_DATA_LOCAL(canary, .fill 48, 1, 0) - SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 0a0539e1cc81..8c534c36adfa 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -26,6 +26,7 @@ /* code below belongs to the image kernel */ .align PAGE_SIZE SYM_FUNC_START(restore_registers) + ANNOTATE_NOENDBR /* go back to the original page tables */ movq %r9, %cr3 @@ -119,6 +120,7 @@ SYM_FUNC_END(restore_image) /* code below has been relocated to a safe page */ SYM_FUNC_START(core_restore_code) + ANNOTATE_NOENDBR /* switch to temporary page tables */ movq %rax, %cr3 /* flush TLB */ diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h index c76041a35397..867e55f1d6af 100644 --- a/arch/x86/realmode/rm/realmode.h +++ b/arch/x86/realmode/rm/realmode.h @@ -2,7 +2,7 @@ #ifndef ARCH_X86_REALMODE_RM_REALMODE_H #define ARCH_X86_REALMODE_RM_REALMODE_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* * 16-bit ljmpw to the real_mode_seg @@ -12,7 +12,7 @@ */ #define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Signature at the end of the realmode region diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 0e4fd08ae447..3b6d8fa82d3e 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -7,7 +7,7 @@ #ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H #define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> /* This must match data at wakeup.S */ diff --git a/arch/x86/tools/cpufeaturemasks.awk b/arch/x86/tools/cpufeaturemasks.awk new file mode 100755 index 000000000000..173d5bf2d999 --- /dev/null +++ b/arch/x86/tools/cpufeaturemasks.awk @@ -0,0 +1,88 @@ +#!/usr/bin/awk +# +# Convert cpufeatures.h to a list of compile-time masks +# Note: this blithely assumes that each word has at least one +# feature defined in it; if not, something else is wrong! 
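
The script converts each X86_FEATURE_* definition of the form (word*32 + bit) into per-word REQUIRED/DISABLED masks, and its END block emits a *_MASK_BIT_SET(x) helper that recovers the word as x >> 5 and the bit as x & 31. A small standalone C sketch of that bit arithmetic, with made-up NCAPINTS and mask values (the real ones are produced by this script from cpufeatures.h and .config at build time):

/*
 * Sketch of the word/bit arithmetic behind the generated masks:
 * feature number x lives in 32-bit word (x >> 5), bit (x & 31).
 * NCAPINTS and the mask values below are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define NCAPINTS 2			/* two capability words for the demo */

static const uint32_t required_mask[NCAPINTS] = {
	0x00000009u,			/* pretend features 0 and 3 are required */
	0x00010000u,			/* pretend feature 48 (word 1, bit 16) is required */
};

/* Mirrors what the generated REQUIRED_MASK_BIT_SET(x) macro checks. */
static int required_bit_set(unsigned int x)
{
	if ((x >> 5) >= NCAPINTS)
		return 0;
	return (required_mask[x >> 5] >> (x & 31)) & 1;
}

int main(void)
{
	const unsigned int feats[] = { 0, 3, 7, 48 };

	for (unsigned int i = 0; i < sizeof(feats) / sizeof(feats[0]); i++)
		printf("feature %2u required: %d\n",
		       feats[i], required_bit_set(feats[i]));
	return 0;
}
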
+# + +BEGIN { + printf "#ifndef _ASM_X86_CPUFEATUREMASKS_H\n"; + printf "#define _ASM_X86_CPUFEATUREMASKS_H\n\n"; + + file = 0 +} + +FNR == 1 { + ++file; + + # arch/x86/include/asm/cpufeatures.h + if (file == 1) + FS = "[ \t()*+]+"; + + # .config + if (file == 2) + FS = "="; +} + +# Create a dictionary of sorts, containing all defined feature bits +file == 1 && $1 ~ /^#define$/ && $2 ~ /^X86_FEATURE_/ { + nfeat = $3 * $4 + $5; + feat = $2; + sub(/^X86_FEATURE_/, "", feat); + feats[nfeat] = feat; +} +file == 1 && $1 ~ /^#define$/ && $2 == "NCAPINTS" { + ncapints = int($3); +} + +# Create a dictionary featstat[REQUIRED|DISABLED, FEATURE_NAME] = on | off +file == 2 && $1 ~ /^CONFIG_X86_(REQUIRED|DISABLED)_FEATURE_/ { + on = ($2 == "y"); + if (split($1, fs, "CONFIG_X86_|_FEATURE_") == 3) + featstat[fs[2], fs[3]] = on; +} + +END { + sets[1] = "REQUIRED"; + sets[2] = "DISABLED"; + + for (ns in sets) { + s = sets[ns]; + + printf "/*\n"; + printf " * %s features:\n", s; + printf " *\n"; + fstr = ""; + for (i = 0; i < ncapints; i++) { + mask = 0; + for (j = 0; j < 32; j++) { + feat = feats[i*32 + j]; + if (featstat[s, feat]) { + nfstr = fstr " " feat; + if (length(nfstr) > 72) { + printf " * %s\n", fstr; + nfstr = " " feat; + } + fstr = nfstr; + mask += (2 ^ j); + } + } + masks[i] = mask; + } + printf " * %s\n */\n", fstr; + + for (i = 0; i < ncapints; i++) + printf "#define %s_MASK%d\t0x%08xU\n", s, i, masks[i]; + + printf "\n#define %s_MASK_BIT_SET(x)\t\t\t\\\n", s; + printf "\t((\t\t\t\t\t"; + for (i = 0; i < ncapints; i++) { + if (masks[i]) + printf "\t\\\n\t\t((x) >> 5) == %2d ? %s_MASK%d :", i, s, i; + } + printf " 0\t\\\n"; + printf "\t) & (1U << ((x) & 31)))\n\n"; + } + + printf "#endif /* _ASM_X86_CPUFEATUREMASKS_H */\n"; +} diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index e937be979ec8..5778bc498415 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -29,9 +29,13 @@ static struct relocs relocs16; static struct relocs relocs32; #if ELF_BITS == 64 -static struct relocs relocs32neg; static struct relocs relocs64; # define FMT PRIu64 + +#ifndef R_X86_64_REX_GOTPCRELX +# define R_X86_64_REX_GOTPCRELX 42 +#endif + #else # define FMT PRIu32 #endif @@ -86,8 +90,6 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__initramfs_start|" "(jiffies|jiffies_64)|" #if ELF_BITS == 64 - "__per_cpu_load|" - "init_per_cpu__.*|" "__end_rodata_hpage_align|" #endif "_end)$" @@ -227,6 +229,7 @@ static const char *rel_type(unsigned type) REL_TYPE(R_X86_64_PC16), REL_TYPE(R_X86_64_8), REL_TYPE(R_X86_64_PC8), + REL_TYPE(R_X86_64_REX_GOTPCRELX), #else REL_TYPE(R_386_NONE), REL_TYPE(R_386_32), @@ -284,34 +287,6 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) return name; } -static Elf_Sym *sym_lookup(const char *symname) -{ - int i; - - for (i = 0; i < shnum; i++) { - struct section *sec = &secs[i]; - long nsyms; - char *strtab; - Elf_Sym *symtab; - Elf_Sym *sym; - - if (sec->shdr.sh_type != SHT_SYMTAB) - continue; - - nsyms = sec->shdr.sh_size/sizeof(Elf_Sym); - symtab = sec->symtab; - strtab = sec->link->strtab; - - for (sym = symtab; --nsyms >= 0; sym++) { - if (!sym->st_name) - continue; - if (strcmp(symname, strtab + sym->st_name) == 0) - return sym; - } - } - return 0; -} - #if BYTE_ORDER == LITTLE_ENDIAN # define le16_to_cpu(val) (val) # define le32_to_cpu(val) (val) @@ -760,84 +735,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, } } -/* - * The .data..percpu section is a special case for x86_64 SMP kernels. 
- * It is used to initialize the actual per_cpu areas and to provide - * definitions for the per_cpu variables that correspond to their offsets - * within the percpu area. Since the values of all of the symbols need - * to be offsets from the start of the per_cpu area the virtual address - * (sh_addr) of .data..percpu is 0 in SMP kernels. - * - * This means that: - * - * Relocations that reference symbols in the per_cpu area do not - * need further relocation (since the value is an offset relative - * to the start of the per_cpu area that does not change). - * - * Relocations that apply to the per_cpu area need to have their - * offset adjusted by by the value of __per_cpu_load to make them - * point to the correct place in the loaded image (because the - * virtual address of .data..percpu is 0). - * - * For non SMP kernels .data..percpu is linked as part of the normal - * kernel data and does not require special treatment. - * - */ -static int per_cpu_shndx = -1; -static Elf_Addr per_cpu_load_addr; - -static void percpu_init(void) -{ - int i; - - for (i = 0; i < shnum; i++) { - ElfW(Sym) *sym; - - if (strcmp(sec_name(i), ".data..percpu")) - continue; - - if (secs[i].shdr.sh_addr != 0) /* non SMP kernel */ - return; - - sym = sym_lookup("__per_cpu_load"); - if (!sym) - die("can't find __per_cpu_load\n"); - - per_cpu_shndx = i; - per_cpu_load_addr = sym->st_value; - - return; - } -} - #if ELF_BITS == 64 -/* - * Check to see if a symbol lies in the .data..percpu section. - * - * The linker incorrectly associates some symbols with the - * .data..percpu section so we also need to check the symbol - * name to make sure that we classify the symbol correctly. - * - * The GNU linker incorrectly associates: - * __init_begin - * __per_cpu_load - * - * The "gold" linker incorrectly associates: - * init_per_cpu__fixed_percpu_data - * init_per_cpu__gdt_page - */ -static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) -{ - int shndx = sym_index(sym); - - return (shndx == per_cpu_shndx) && - strcmp(symname, "__init_begin") && - strcmp(symname, "__per_cpu_load") && - strncmp(symname, "init_per_cpu_", 13); -} - - static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { @@ -848,12 +747,6 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, if (sym->st_shndx == SHN_UNDEF) return 0; - /* - * Adjust the offset if this reloc applies to the percpu section. - */ - if (sec->shdr.sh_info == per_cpu_shndx) - offset += per_cpu_load_addr; - switch (r_type) { case R_X86_64_NONE: /* NONE can be ignored. */ @@ -861,33 +754,23 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, case R_X86_64_PC32: case R_X86_64_PLT32: + case R_X86_64_REX_GOTPCRELX: /* - * PC relative relocations don't need to be adjusted unless - * referencing a percpu symbol. + * PC relative relocations don't need to be adjusted. * * NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32. */ - if (is_percpu_sym(sym, symname)) - add_reloc(&relocs32neg, offset); break; case R_X86_64_PC64: /* * Only used by jump labels */ - if (is_percpu_sym(sym, symname)) - die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname); break; case R_X86_64_32: case R_X86_64_32S: case R_X86_64_64: - /* - * References to the percpu area don't need to be adjusted. 
- */ - if (is_percpu_sym(sym, symname)) - break; - if (shn_abs) { /* * Whitelisted absolute symbols do not require @@ -1055,7 +938,8 @@ static int cmp_relocs(const void *va, const void *vb) static void sort_relocs(struct relocs *r) { - qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); + if (r->count) + qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); } static int write32(uint32_t v, FILE *f) @@ -1099,7 +983,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Order the relocations for more efficient processing */ sort_relocs(&relocs32); #if ELF_BITS == 64 - sort_relocs(&relocs32neg); sort_relocs(&relocs64); #else sort_relocs(&relocs16); @@ -1131,13 +1014,6 @@ static void emit_relocs(int as_text, int use_real_mode) /* Now print each relocation */ for (i = 0; i < relocs64.count; i++) write_reloc(relocs64.offset[i], stdout); - - /* Print a stop */ - write_reloc(0, stdout); - - /* Now print each inverse 32-bit relocation */ - for (i = 0; i < relocs32neg.count; i++) - write_reloc(relocs32neg.offset[i], stdout); #endif /* Print a stop */ @@ -1190,9 +1066,6 @@ void process(FILE *fp, int use_real_mode, int as_text, read_symtabs(fp); read_relocs(fp); - if (ELF_BITS == 64) - percpu_init(); - if (show_absolute_syms) { print_absolute_symbols(); return; diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 42e74a5a7d78..fc473ca12c44 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -198,7 +198,6 @@ static void __init __snp_fixup_e820_tables(u64 pa) pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa); e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); - e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); if (!memblock_is_region_reserved(pa, PMD_SIZE)) memblock_reserve(pa, PMD_SIZE); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 77e788e928cd..98d8a50d2aed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,7 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5e57835e999d..dcc2041f8e61 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -73,6 +73,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#include <asm/irq_stack.h> #ifdef CONFIG_X86_IOPL_IOPERM #include <asm/io_bitmap.h> #endif @@ -94,6 +95,44 @@ void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. 
+ */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. + */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} + +#else + +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } + +#endif + struct tls_descs { struct desc_struct desc[3]; }; @@ -687,6 +726,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) } #endif +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + bool inhcall; + + instrumentation_begin(); + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + irqentry_exit_cond_resched(); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + instrumentation_end(); + irqentry_exit(regs, state); + } +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d078de2c952b..38971c6dcd4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6863d3da7dec..688ff59318ae 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -70,7 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index b518f36d1ca2..109af12f7647 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -51,6 +51,7 @@ SYM_FUNC_END(xen_hypercall_pv) * non-zero. */ SYM_FUNC_START(xen_irq_disable_direct) + ENDBR movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -90,6 +91,7 @@ SYM_FUNC_END(check_events) * then enter the hypervisor to get them handled. */ SYM_FUNC_START(xen_irq_enable_direct) + ENDBR FRAME_BEGIN /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) @@ -120,6 +122,7 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). 
*/ SYM_FUNC_START(xen_save_fl_direct) + ENDBR testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) setz %ah addb %ah, %ah @@ -127,6 +130,7 @@ SYM_FUNC_START(xen_save_fl_direct) SYM_FUNC_END(xen_save_fl_direct) SYM_FUNC_START(xen_read_cr2) + ENDBR FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX @@ -135,6 +139,7 @@ SYM_FUNC_START(xen_read_cr2) SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) + ENDBR FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX FRAME_END diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 894edf8d6d62..5dad6c51cdc3 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -31,16 +31,14 @@ SYM_CODE_START(startup_xen) leaq __top_init_kernel_stack(%rip), %rsp - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax - cdq + xorl %eax, %eax + xorl %edx, %edx wrmsr mov %rsi, %rdi @@ -133,11 +131,13 @@ SYM_FUNC_START(xen_hypercall_hvm) SYM_FUNC_END(xen_hypercall_hvm) SYM_FUNC_START(xen_hypercall_amd) + ANNOTATE_NOENDBR vmmcall RET SYM_FUNC_END(xen_hypercall_amd) SYM_FUNC_START(xen_hypercall_intel) + ANNOTATE_NOENDBR vmcall RET SYM_FUNC_END(xen_hypercall_intel) diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 37effc1b134e..f657a77314f8 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 38092d21acf8..44c07c4e0833 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -629,15 +629,11 @@ DEFINE_SPINLOCK(die_lock); void __noreturn die(const char * str, struct pt_regs * regs, long err) { static int die_counter; - const char *pr = ""; - - if (IS_ENABLED(CONFIG_PREEMPTION)) - pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
" PREEMPT_RT" : " PREEMPT"; console_verbose(); spin_lock_irq(&die_lock); - pr_info("%s: sig: %ld [#%d]%s\n", str, err, ++die_counter, pr); + pr_info("%s: sig: %ld [#%d]\n", str, err, ++die_counter); show_regs(regs); if (!user_mode(regs)) show_stack(NULL, (unsigned long *)regs->areg[1], KERN_INFO); diff --git a/block/bdev.c b/block/bdev.c index 9d73a8fbf7f9..4844d1e27b6f 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -148,6 +148,8 @@ static void set_init_blocksize(struct block_device *bdev) bsize <<= 1; } BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); + mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, + get_order(bsize)); } int set_blocksize(struct file *file, int size) @@ -169,6 +171,7 @@ int set_blocksize(struct file *file, int size) if (inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); inode->i_blkbits = blksize_bits(size); + mapping_set_folio_min_order(inode->i_mapping, get_order(size)); kill_bdev(bdev); } return 0; @@ -178,10 +181,11 @@ EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) { + if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE) + return 0; if (set_blocksize(sb->s_bdev_file, size)) return 0; - /* If we get here, we know size is power of two - * and it's value is between 512 and PAGE_SIZE */ + /* If we get here, we know size is validated */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; @@ -1274,9 +1278,6 @@ void bdev_statx(struct path *path, struct kstat *stat, struct inode *backing_inode; struct block_device *bdev; - if (!(request_mask & (STATX_DIOALIGN | STATX_WRITE_ATOMIC))) - return; - backing_inode = d_backing_inode(path->dentry); /* @@ -1303,6 +1304,8 @@ void bdev_statx(struct path *path, struct kstat *stat, queue_atomic_write_unit_max_bytes(bd_queue)); } + stat->blksize = bdev_io_min(bdev); + blkdev_put_no_open(bdev); } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 167542201603..abd80dc13562 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7315,9 +7315,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) INIT_LIST_HEAD(&bfqd->dispatch); - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + hrtimer_setup(&bfqd->idle_slice_timer, bfq_idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); bfqd->queue_weights_tree = RB_ROOT_CACHED; #ifdef CONFIG_BFQ_GROUP_IOSCHED diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9ed93d91d754..1994a11ff903 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -659,6 +659,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; + pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); @@ -1770,12 +1771,15 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->dfl_cftypes) + if (pol->dfl_cftypes == pol->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); - if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); + } mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 65a1d4427ccf..ed11438eb4be 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3004,8 +3004,7 @@ 
static void ioc_pd_init(struct blkg_policy_data *pd) iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); - hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->waitq_timer.function = iocg_waitq_timer_fn; + hrtimer_setup(&iocg->waitq_timer, iocg_waitq_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->level = blkg->blkcg->css.cgroup->level; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 8fff7ccc0ac7..13659dc15c3f 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -113,27 +113,18 @@ static void ioprio_free_cpd(struct blkcg_policy_data *cpd) kfree(blkcg); } -#define IOPRIO_ATTRS \ - { \ - .name = "prio.class", \ - .seq_show = ioprio_show_prio_policy, \ - .write = ioprio_set_prio_policy, \ - }, \ - { } /* sentinel */ - -/* cgroup v2 attributes */ static struct cftype ioprio_files[] = { - IOPRIO_ATTRS -}; - -/* cgroup v1 attributes */ -static struct cftype ioprio_legacy_files[] = { - IOPRIO_ATTRS + { + .name = "prio.class", + .seq_show = ioprio_show_prio_policy, + .write = ioprio_set_prio_policy, + }, + { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, - .legacy_cftypes = ioprio_legacy_files, + .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index c20eb63750f5..43aba57b48f0 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -172,9 +172,10 @@ static void free_slice(struct kref *kref) static int clone_range_of_sgt_for_slice(struct qaic_device *qdev, struct sg_table **sgt_out, struct sg_table *sgt_in, u64 size, u64 offset) { - int total_len, len, nents, offf = 0, offl = 0; struct scatterlist *sg, *sgn, *sgf, *sgl; + unsigned int len, nents, offf, offl; struct sg_table *sgt; + size_t total_len; int ret, j; /* find out number of relevant nents needed for this mem */ @@ -182,6 +183,8 @@ static int clone_range_of_sgt_for_slice(struct qaic_device *qdev, struct sg_tabl sgf = NULL; sgl = NULL; nents = 0; + offf = 0; + offl = 0; size = size ? 
size : PAGE_SIZE; for_each_sgtable_dma_sg(sgt_in, sg, j) { @@ -554,6 +557,7 @@ static bool invalid_sem(struct qaic_sem *sem) static int qaic_validate_req(struct qaic_device *qdev, struct qaic_attach_slice_entry *slice_ent, u32 count, u64 total_size) { + u64 total; int i; for (i = 0; i < count; i++) { @@ -563,7 +567,8 @@ static int qaic_validate_req(struct qaic_device *qdev, struct qaic_attach_slice_ invalid_sem(&slice_ent[i].sem2) || invalid_sem(&slice_ent[i].sem3)) return -EINVAL; - if (slice_ent[i].offset + slice_ent[i].size > total_size) + if (check_add_overflow(slice_ent[i].offset, slice_ent[i].size, &total) || + total > total_size) return -EINVAL; } diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 40208a0f5dfb..797070fc9a3f 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -5,6 +5,10 @@ ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_processor_idle.o += -DDISABLE_BRANCH_PROFILING +endif + # # ACPI Boot-Time Table Parsing # diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 698897b29de2..586cc7d1d8aa 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -590,6 +590,8 @@ static void acpi_idle_play_dead(struct cpuidle_device *dev, int index) raw_safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { io_idle(cx->address); + } else if (cx->entry_method == ACPI_CSTATE_FFH) { + acpi_processor_ffh_play_dead(cx); } else return; } diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c085dd81ebe7..d956735e2a76 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2845,6 +2845,10 @@ int ata_dev_configure(struct ata_device *dev) (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2) dev->quirks |= ATA_QUIRK_NOLPM; + if (dev->quirks & ATA_QUIRK_NO_LPM_ON_ATI && + ata_dev_check_adapter(dev, PCI_VENDOR_ID_ATI)) + dev->quirks |= ATA_QUIRK_NOLPM; + if (ap->flags & ATA_FLAG_NO_LPM) dev->quirks |= ATA_QUIRK_NOLPM; @@ -3897,6 +3901,7 @@ static const char * const ata_quirk_names[] = { [__ATA_QUIRK_MAX_SEC_1024] = "maxsec1024", [__ATA_QUIRK_MAX_TRIM_128M] = "maxtrim128m", [__ATA_QUIRK_NO_NCQ_ON_ATI] = "noncqonati", + [__ATA_QUIRK_NO_LPM_ON_ATI] = "nolpmonati", [__ATA_QUIRK_NO_ID_DEV_LOG] = "noiddevlog", [__ATA_QUIRK_NO_LOG_DIR] = "nologdir", [__ATA_QUIRK_NO_FUA] = "nofua", @@ -4142,13 +4147,16 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { ATA_QUIRK_ZERO_AFTER_TRIM }, { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI }, + ATA_QUIRK_NO_NCQ_ON_ATI | + ATA_QUIRK_NO_LPM_ON_ATI }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI }, + ATA_QUIRK_NO_NCQ_ON_ATI | + ATA_QUIRK_NO_LPM_ON_ATI }, { "SAMSUNG*MZ7LH*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI, }, + ATA_QUIRK_NO_NCQ_ON_ATI | + ATA_QUIRK_NO_LPM_ON_ATI }, { "FCCT*M500*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM }, diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index dce24806a052..2d32125c16fd 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -935,9 +935,8 @@ static int octeon_cf_probe(struct platform_device *pdev) ap->mwdma_mask = enable_dma ? ATA_MWDMA4 : 0; /* True IDE mode needs a timer to poll for not-busy. 
*/ - hrtimer_init(&cf_port->delayed_finish, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cf_port->delayed_finish.function = octeon_cf_delayed_finish; + hrtimer_setup(&cf_port->delayed_finish, octeon_cf_delayed_finish, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } else { /* 16 bit but not True IDE */ base = cs0 + 0x800; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 3ebe77566788..af0029d30dbe 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -11,6 +11,7 @@ #include <linux/cleanup.h> #include <linux/cpu.h> #include <linux/cpufreq.h> +#include <linux/cpu_smt.h> #include <linux/device.h> #include <linux/of.h> #include <linux/slab.h> @@ -28,7 +29,7 @@ static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; -DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 0; EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); static bool supports_scale_freq_counters(const struct cpumask *cpus) @@ -293,13 +294,15 @@ void topology_normalize_cpu_scale(void) capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); + capacity = raw_capacity[cpu] * + (per_cpu(capacity_freq_ref, cpu) ?: 1); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); + capacity = raw_capacity[cpu] * + (per_cpu(capacity_freq_ref, cpu) ?: 1); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); @@ -506,6 +509,10 @@ core_initcall(free_raw_capacity); #endif #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) + +/* Used to enable the SMT control */ +static unsigned int max_smt_thread_num = 1; + /* * This function returns the logic cpu number of the node. * There are basically three kinds of return values: @@ -565,6 +572,8 @@ static int __init parse_core(struct device_node *core, int package_id, i++; } while (1); + max_smt_thread_num = max_t(unsigned int, max_smt_thread_num, i); + cpu = get_cpu_for_node(core); if (cpu >= 0) { if (!leaf) { @@ -677,6 +686,17 @@ static int __init parse_socket(struct device_node *socket) if (!has_socket) ret = parse_cluster(socket, 0, -1, 0); + /* + * Reset the max_smt_thread_num to 1 on failure. Since on failure + * we need to notify the framework the SMT is not supported, but + * max_smt_thread_num can be initialized to the SMT thread number + * of the cores which are successfully parsed. 
+ */ + if (ret) + max_smt_thread_num = 1; + + cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num); + return ret; } diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index b848764ef018..6dd1a8860f1c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -63,22 +63,6 @@ __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; -static struct dentry *public_dev_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - struct super_block *s = mnt->mnt_sb; - int err; - - atomic_inc(&s->s_active); - down_write(&s->s_umount); - err = reconfigure_single(s, flags, data); - if (err < 0) { - deactivate_locked_super(s); - return ERR_PTR(err); - } - return dget(s->s_root); -} - static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS @@ -89,9 +73,40 @@ static struct file_system_type internal_fs_type = { .kill_sb = kill_litter_super, }; +/* Simply take a ref on the existing mount */ +static int devtmpfs_get_tree(struct fs_context *fc) +{ + struct super_block *sb = mnt->mnt_sb; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + fc->root = dget(sb->s_root); + return 0; +} + +/* Ops are filled in during init depending on underlying shmem or ramfs type */ +struct fs_context_operations devtmpfs_context_ops = {}; + +/* Call the underlying initialization and set to our ops */ +static int devtmpfs_init_fs_context(struct fs_context *fc) +{ + int ret; +#ifdef CONFIG_TMPFS + ret = shmem_init_fs_context(fc); +#else + ret = ramfs_init_fs_context(fc); +#endif + if (ret < 0) + return ret; + + fc->ops = &devtmpfs_context_ops; + + return 0; +} + static struct file_system_type dev_fs_type = { .name = "devtmpfs", - .mount = public_dev_mount, + .init_fs_context = devtmpfs_init_fs_context, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) @@ -160,18 +175,17 @@ static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; - int err; dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); - if (!err) + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); + if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); - return err; + return PTR_ERR_OR_ZERO(dentry); } static int create_path(const char *nodepath) @@ -245,15 +259,12 @@ static int dev_rmdir(const char *name) dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (d_really_is_positive(dentry)) { - if (d_inode(dentry)->i_private == &thread) - err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), - dentry); - else - err = -EPERM; - } else { - err = -ENOENT; - } + if (d_inode(dentry)->i_private == &thread) + err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), + dentry); + else + err = -EPERM; + dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); @@ -310,6 +321,8 @@ static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; + struct kstat stat; + struct path p; int deleted = 0; int err; @@ -317,32 +330,28 @@ static int handle_remove(const char *nodename, struct device *dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (d_really_is_positive(dentry)) { - struct kstat stat; - struct path p = {.mnt = parent.mnt, .dentry = dentry}; - err = vfs_getattr(&p, &stat, 
STATX_TYPE | STATX_MODE, - AT_STATX_SYNC_AS_STAT); - if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { - struct iattr newattrs; - /* - * before unlinking this node, reset permissions - * of possible references like hardlinks - */ - newattrs.ia_uid = GLOBAL_ROOT_UID; - newattrs.ia_gid = GLOBAL_ROOT_GID; - newattrs.ia_mode = stat.mode & ~0777; - newattrs.ia_valid = - ATTR_UID|ATTR_GID|ATTR_MODE; - inode_lock(d_inode(dentry)); - notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); - inode_unlock(d_inode(dentry)); - err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), - dentry, NULL); - if (!err || err == -ENOENT) - deleted = 1; - } - } else { - err = -ENOENT; + p.mnt = parent.mnt; + p.dentry = dentry; + err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, + AT_STATX_SYNC_AS_STAT); + if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { + struct iattr newattrs; + /* + * before unlinking this node, reset permissions + * of possible references like hardlinks + */ + newattrs.ia_uid = GLOBAL_ROOT_UID; + newattrs.ia_gid = GLOBAL_ROOT_GID; + newattrs.ia_mode = stat.mode & ~0777; + newattrs.ia_valid = + ATTR_UID|ATTR_GID|ATTR_MODE; + inode_lock(d_inode(dentry)); + notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); + inode_unlock(d_inode(dentry)); + err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), + dentry, NULL); + if (!err || err == -ENOENT) + deleted = 1; } dput(dentry); inode_unlock(d_inode(parent.dentry)); @@ -443,6 +452,31 @@ static int __ref devtmpfsd(void *p) } /* + * Get the underlying (shmem/ramfs) context ops to build ours + */ +static int devtmpfs_configure_context(void) +{ + struct fs_context *fc; + + fc = fs_context_for_reconfigure(mnt->mnt_root, mnt->mnt_sb->s_flags, + MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + /* Set up devtmpfs_context_ops based on underlying type */ + devtmpfs_context_ops.free = fc->ops->free; + devtmpfs_context_ops.dup = fc->ops->dup; + devtmpfs_context_ops.parse_param = fc->ops->parse_param; + devtmpfs_context_ops.parse_monolithic = fc->ops->parse_monolithic; + devtmpfs_context_ops.get_tree = &devtmpfs_get_tree; + devtmpfs_context_ops.reconfigure = fc->ops->reconfigure; + + put_fs_context(fc); + + return 0; +} + +/* * Create devtmpfs instance, driver-core devices will add their device * nodes here. 
*/ @@ -456,6 +490,13 @@ int __init devtmpfs_init(void) pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } + + err = devtmpfs_configure_context(); + if (err) { + pr_err("unable to configure devtmpfs type %d\n", err); + return err; + } + err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 2ee45841486b..425c43b2d478 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1764,8 +1764,8 @@ void pm_runtime_init(struct device *dev) INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; - hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - dev->power.suspend_timer.function = pm_suspend_timer_fn; + hrtimer_setup(&dev->power.suspend_timer, pm_suspend_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); init_waitqueue_head(&dev->power.wait_queue); } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index fdc7a0b2af10..da1ecbf988b8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1426,8 +1426,7 @@ static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; + hrtimer_setup(&nullb->bw_timer, nullb_bwtimer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } @@ -1604,8 +1603,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; + hrtimer_setup(&cmd->timer, null_cmd_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } cmd->error = BLK_STS_OK; cmd->nq = nq; diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 7174bfccc7b3..b95f6d0f17ed 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -152,8 +152,7 @@ static int timeriomem_rng_probe(struct platform_device *pdev) priv->period = ns_to_ktime(period * NSEC_PER_USEC); init_completion(&priv->completion); - hrtimer_init(&priv->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - priv->timer.function = timeriomem_rng_trigger; + hrtimer_setup(&priv->timer, timeriomem_rng_trigger, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); priv->rng_ops.name = dev_name(&pdev->dev); priv->rng_ops.read = timeriomem_rng_read; diff --git a/drivers/char/random.c b/drivers/char/random.c index 2581186fa61b..92cbd24a36d8 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -278,7 +278,7 @@ static void crng_reseed(struct work_struct *work) WRITE_ONCE(base_crng.generation, next_gen); #ifdef CONFIG_VDSO_GETRANDOM /* base_crng.generation's invalid value is ULONG_MAX, while - * _vdso_rng_data.generation's invalid value is 0, so add one to the + * vdso_k_rng_data->generation's invalid value is 0, so add one to the * former to arrive at the latter. Use smp_store_release so that this * is ordered with the write above to base_crng.generation. Pairs with * the smp_rmb() before the syscall in the vDSO code. 
@@ -290,7 +290,7 @@ static void crng_reseed(struct work_struct *work) * because the vDSO side only checks whether the value changed, without * actually using or interpreting the value. */ - smp_store_release((unsigned long *)&__arch_get_k_vdso_rng_data()->generation, next_gen + 1); + smp_store_release((unsigned long *)&vdso_k_rng_data->generation, next_gen + 1); #endif if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; @@ -743,7 +743,7 @@ static void __cold _credit_init_bits(size_t bits) queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); #ifdef CONFIG_VDSO_GETRANDOM - WRITE_ONCE(__arch_get_k_vdso_rng_data()->is_ready, true); + WRITE_ONCE(vdso_k_rng_data->is_ready, true); #endif wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 index 97c2d4f15d76..2c5c228408bf 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -340,3 +340,15 @@ config X86_SPEEDSTEP_RELAXED_CAP_CHECK option lets the probing code bypass some of those checks if the parameter "relaxed_check=1" is passed to the module. +config CPUFREQ_ARCH_CUR_FREQ + default y + bool "Current frequency derived from HW provided feedback" + help + This determines whether the scaling_cur_freq sysfs attribute returns + the last requested frequency or a more precise value based on hardware + provided feedback (as architected counters). + Given that a more precise frequency can now be provided via the + cpuinfo_avg_freq attribute, by enabling this option, + scaling_cur_freq maintains the provision of a counter based frequency, + for compatibility reasons. + diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 30ffbddc7ece..0ce79fed8e55 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -729,18 +729,26 @@ show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); -__weak unsigned int arch_freq_get_on_cpu(int cpu) +__weak int arch_freq_get_on_cpu(int cpu) { - return 0; + return -EOPNOTSUPP; +} + +static inline bool cpufreq_avg_freq_supported(struct cpufreq_policy *policy) +{ + return arch_freq_get_on_cpu(policy->cpu) != -EOPNOTSUPP; } static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf) { ssize_t ret; - unsigned int freq; + int freq; + + freq = IS_ENABLED(CONFIG_CPUFREQ_ARCH_CUR_FREQ) + ? arch_freq_get_on_cpu(policy->cpu) + : 0; - freq = arch_freq_get_on_cpu(policy->cpu); - if (freq) + if (freq > 0) ret = sysfs_emit(buf, "%u\n", freq); else if (cpufreq_driver->setpolicy && cpufreq_driver->get) ret = sysfs_emit(buf, "%u\n", cpufreq_driver->get(policy->cpu)); @@ -785,6 +793,19 @@ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy, } /* + * show_cpuinfo_avg_freq - average CPU frequency as detected by hardware + */ +static ssize_t show_cpuinfo_avg_freq(struct cpufreq_policy *policy, + char *buf) +{ + int avg_freq = arch_freq_get_on_cpu(policy->cpu); + + if (avg_freq > 0) + return sysfs_emit(buf, "%u\n", avg_freq); + return avg_freq != 0 ? 
avg_freq : -EINVAL; +} + +/* * show_scaling_governor - show the current policy for the specified CPU */ static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf) @@ -946,6 +967,7 @@ static ssize_t show_bios_limit(struct cpufreq_policy *policy, char *buf) } cpufreq_freq_attr_ro_perm(cpuinfo_cur_freq, 0400); +cpufreq_freq_attr_ro(cpuinfo_avg_freq); cpufreq_freq_attr_ro(cpuinfo_min_freq); cpufreq_freq_attr_ro(cpuinfo_max_freq); cpufreq_freq_attr_ro(cpuinfo_transition_latency); @@ -1073,6 +1095,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } + if (cpufreq_avg_freq_supported(policy)) { + ret = sysfs_create_file(&policy->kobj, &cpuinfo_avg_freq.attr); + if (ret) + return ret; + } + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) return ret; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9c4cc01fd51a..f06b9bc99945 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2200,28 +2200,20 @@ static int knl_get_turbo_pstate(int cpu) return ret; } -static void hybrid_get_type(void *data) -{ - u8 *cpu_type = data; - - *cpu_type = get_this_hybrid_cpu_type(); -} - static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - u8 cpu_type = 0; - - smp_call_function_single(cpu, hybrid_get_type, &cpu_type, 1); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u8 cpu_type = c->topo.intel_type; /* * Return the hybrid scaling factor for P-cores and use the * default core scaling for E-cores. */ - if (cpu_type == 0x40) + if (cpu_type == INTEL_CPU_TYPE_CORE) return hybrid_scaling_factor; - if (cpu_type == 0x20) + if (cpu_type == INTEL_CPU_TYPE_ATOM) return core_get_scaling(); } diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile index d103342b7cfc..1de9e92c5b0f 100644 --- a/drivers/cpuidle/Makefile +++ b/drivers/cpuidle/Makefile @@ -3,6 +3,9 @@ # Makefile for cpuidle. # +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c index e2a1e4463b6f..0470d7c175f4 100644 --- a/drivers/devfreq/event/rockchip-dfi.c +++ b/drivers/devfreq/event/rockchip-dfi.c @@ -642,8 +642,7 @@ static int rockchip_ddr_perf_init(struct rockchip_dfi *dfi) if (ret) return ret; - hrtimer_init(&dfi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - dfi->timer.function = rockchip_dfi_timer; + hrtimer_setup(&dfi->timer, rockchip_dfi_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); switch (dfi->ddr_type) { case ROCKCHIP_DDRTYPE_LPDDR2: diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c index 32019dc33cca..1877201d1aa9 100644 --- a/drivers/dpll/dpll_core.c +++ b/drivers/dpll/dpll_core.c @@ -505,7 +505,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC); ret = xa_alloc_cyclic(&dpll_pin_xa, &pin->id, pin, xa_limit_32b, &dpll_pin_xa_id, GFP_KERNEL); - if (ret) + if (ret < 0) goto err_xa_alloc; return pin; err_xa_alloc: diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 2051a7c944a5..19ad3c3b675d 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -75,6 +75,34 @@ config EDAC_GHES In doubt, say 'Y'. 
+config EDAC_SCRUB + bool "EDAC scrub feature" + help + The EDAC scrub feature is optional and is designed to control the + memory scrubbers in the system. The common sysfs scrub interface + abstracts the control of various arbitrary scrubbing functionalities + into a unified set of functions. + Say 'y/n' to enable/disable EDAC scrub feature. + +config EDAC_ECS + bool "EDAC ECS (Error Check Scrub) feature" + help + The EDAC ECS feature is optional and is designed to control on-die + error check scrub (e.g., DDR5 ECS) in the system. The common sysfs + ECS interface abstracts the control of various ECS functionalities + into a unified set of functions. + Say 'y/n' to enable/disable EDAC ECS feature. + +config EDAC_MEM_REPAIR + bool "EDAC memory repair feature" + help + The EDAC memory repair feature is optional and is designed to control + the memory devices with repair features, such as Post Package Repair + (PPR), memory sparing etc. The common sysfs memory repair interface + abstracts the control of various memory repair functionalities into + a unified set of functions. + Say 'y/n' to enable/disable EDAC memory repair feature. + config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64)" depends on AMD_NB && EDAC_DECODE_MCE @@ -168,7 +196,7 @@ config EDAC_I3200 config EDAC_IE31200 tristate "Intel e312xx" - depends on PCI && X86 + depends on PCI && X86 && X86_MCE_INTEL help Support for error detection and correction on the Intel E3-1200 based DRAM controllers. diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 89789ba8275f..a8f2d8f6c894 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -12,6 +12,9 @@ edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o edac_core-y += edac_module.o edac_device_sysfs.o wq.o edac_core-$(CONFIG_EDAC_DEBUG) += debugfs.o +edac_core-$(CONFIG_EDAC_SCRUB) += scrub.o +edac_core-$(CONFIG_EDAC_ECS) += ecs.o +edac_core-$(CONFIG_EDAC_MEM_REPAIR) += mem_repair.o ifdef CONFIG_PCI edac_core-y += edac_pci.o edac_pci_sysfs.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8414ceb43e4a..90f0eb7cc5b9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/ras.h> +#include <linux/string_choices.h> #include "amd64_edac.h" #include <asm/amd_nb.h> #include <asm/amd_node.h> @@ -1171,22 +1172,21 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) edac_dbg(1, " LRDIMM %dx rank multiply\n", (dcsm & 0x3)); } - edac_dbg(1, "All DIMMs support ECC:%s\n", - (dclr & BIT(19)) ? "yes" : "no"); + edac_dbg(1, "All DIMMs support ECC: %s\n", str_yes_no(dclr & BIT(19))); edac_dbg(1, " PAR/ERR parity: %s\n", - (dclr & BIT(8)) ? "enabled" : "disabled"); + str_enabled_disabled(dclr & BIT(8))); if (pvt->fam == 0x10) edac_dbg(1, " DCT 128bit mode width: %s\n", (dclr & BIT(11)) ? "128b" : "64b"); edac_dbg(1, " x4 logical DIMMs present: L0: %s L1: %s L2: %s L3: %s\n", - (dclr & BIT(12)) ? "yes" : "no", - (dclr & BIT(13)) ? "yes" : "no", - (dclr & BIT(14)) ? "yes" : "no", - (dclr & BIT(15)) ? "yes" : "no"); + str_yes_no(dclr & BIT(12)), + str_yes_no(dclr & BIT(13)), + str_yes_no(dclr & BIT(14)), + str_yes_no(dclr & BIT(15))); } #define CS_EVEN_PRIMARY BIT(0) @@ -1353,14 +1353,14 @@ static void umc_dump_misc_regs(struct amd64_pvt *pvt) edac_dbg(1, "UMC%d UMC cap high: 0x%x\n", i, umc->umc_cap_hi); edac_dbg(1, "UMC%d ECC capable: %s, ChipKill ECC capable: %s\n", - i, (umc->umc_cap_hi & BIT(30)) ? 
"yes" : "no", - (umc->umc_cap_hi & BIT(31)) ? "yes" : "no"); + i, str_yes_no(umc->umc_cap_hi & BIT(30)), + str_yes_no(umc->umc_cap_hi & BIT(31))); edac_dbg(1, "UMC%d All DIMMs support ECC: %s\n", - i, (umc->umc_cfg & BIT(12)) ? "yes" : "no"); + i, str_yes_no(umc->umc_cfg & BIT(12))); edac_dbg(1, "UMC%d x4 DIMMs present: %s\n", - i, (umc->dimm_cfg & BIT(6)) ? "yes" : "no"); + i, str_yes_no(umc->dimm_cfg & BIT(6))); edac_dbg(1, "UMC%d x16 DIMMs present: %s\n", - i, (umc->dimm_cfg & BIT(7)) ? "yes" : "no"); + i, str_yes_no(umc->dimm_cfg & BIT(7))); umc_debug_display_dimm_sizes(pvt, i); } @@ -1371,11 +1371,11 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt) edac_dbg(1, "F3xE8 (NB Cap): 0x%08x\n", pvt->nbcap); edac_dbg(1, " NB two channel DRAM capable: %s\n", - (pvt->nbcap & NBCAP_DCT_DUAL) ? "yes" : "no"); + str_yes_no(pvt->nbcap & NBCAP_DCT_DUAL)); edac_dbg(1, " ECC capable: %s, ChipKill ECC capable: %s\n", - (pvt->nbcap & NBCAP_SECDED) ? "yes" : "no", - (pvt->nbcap & NBCAP_CHIPKILL) ? "yes" : "no"); + str_yes_no(pvt->nbcap & NBCAP_SECDED), + str_yes_no(pvt->nbcap & NBCAP_CHIPKILL)); debug_dump_dramcfg_low(pvt, pvt->dclr0, 0); @@ -1398,7 +1398,7 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt) if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); + edac_dbg(1, " DramHoleValid: %s\n", str_yes_no(dhar_valid(pvt))); amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -2027,15 +2027,15 @@ static void read_dram_ctl_register(struct amd64_pvt *pvt) if (!dct_ganging_enabled(pvt)) edac_dbg(0, " Address range split per DCT: %s\n", - (dct_high_range_enabled(pvt) ? "yes" : "no")); + str_yes_no(dct_high_range_enabled(pvt))); edac_dbg(0, " data interleave for ECC: %s, DRAM cleared since last warm reset: %s\n", - (dct_data_intlv_enabled(pvt) ? "enabled" : "disabled"), - (dct_memory_cleared(pvt) ? "yes" : "no")); + str_enabled_disabled(dct_data_intlv_enabled(pvt)), + str_yes_no(dct_memory_cleared(pvt))); edac_dbg(0, " channel interleave: %s, " "interleave bits selector: 0x%x\n", - (dct_interleave_enabled(pvt) ? "enabled" : "disabled"), + str_enabled_disabled(dct_interleave_enabled(pvt)), dct_sel_interleave_addr(pvt)); } @@ -3208,8 +3208,7 @@ static bool nb_mce_bank_enabled_on_node(u16 nid) nbe = reg->l & MSR_MCGCTL_NBE; edac_dbg(0, "core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", - cpu, reg->q, - (nbe ? "enabled" : "disabled")); + cpu, reg->q, str_enabled_disabled(nbe)); if (!nbe) goto out; @@ -3353,12 +3352,9 @@ static bool dct_ecc_enabled(struct amd64_pvt *pvt) edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", MSR_IA32_MCG_CTL, nid); - edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, str_enabled_disabled(ecc_en)); - if (!ecc_en || !nb_mce_en) - return false; - else - return true; + return ecc_en && nb_mce_en; } static bool umc_ecc_enabled(struct amd64_pvt *pvt) @@ -3378,7 +3374,7 @@ static bool umc_ecc_enabled(struct amd64_pvt *pvt) } } - edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, (ecc_en ? 
"enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, str_enabled_disabled(ecc_en)); return ecc_en; } diff --git a/drivers/edac/debugfs.c b/drivers/edac/debugfs.c index 4804332d9946..8195fc9c9354 100644 --- a/drivers/edac/debugfs.c +++ b/drivers/edac/debugfs.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only + +#include <linux/string_choices.h> + #include "edac_module.h" static struct dentry *edac_debugfs; @@ -22,7 +25,7 @@ static ssize_t edac_fake_inject_write(struct file *file, "Generating %d %s fake error%s to %d.%d.%d to test core handling. NOTE: this won't test the driver-specific decoding logic.\n", errcount, (type == HW_EVENT_ERR_UNCORRECTED) ? "UE" : "CE", - errcount > 1 ? "s" : "", + str_plural(errcount), mci->fake_inject_layer[0], mci->fake_inject_layer[1], mci->fake_inject_layer[2] diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c new file mode 100755 index 000000000000..1d51838a60c1 --- /dev/null +++ b/drivers/edac/ecs.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic ECS driver is designed to support control of on-die error + * check scrub (e.g., DDR5 ECS). The common sysfs ECS interface abstracts + * the control of various ECS functionalities into a unified set of functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited. + */ + +#include <linux/edac.h> + +#define EDAC_ECS_FRU_NAME "ecs_fru" + +enum edac_ecs_attributes { + ECS_LOG_ENTRY_TYPE, + ECS_MODE, + ECS_RESET, + ECS_THRESHOLD, + ECS_MAX_ATTRS +}; + +struct edac_ecs_dev_attr { + struct device_attribute dev_attr; + int fru_id; +}; + +struct edac_ecs_fru_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_ecs_dev_attr dev_attr[ECS_MAX_ATTRS]; + struct attribute *ecs_attrs[ECS_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +struct edac_ecs_context { + u16 num_media_frus; + struct edac_ecs_fru_context *fru_ctxs; +}; + +#define TO_ECS_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_ecs_dev_attr, dev_attr) + +#define EDAC_ECS_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr); \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private, \ + dev_attr->fru_id, &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +EDAC_ECS_ATTR_SHOW(log_entry_type, get_log_entry_type, u32, "%u\n") +EDAC_ECS_ATTR_SHOW(mode, get_mode, u32, "%u\n") +EDAC_ECS_ATTR_SHOW(threshold, get_threshold, u32, "%u\n") + +#define EDAC_ECS_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr); \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private, \ + dev_attr->fru_id, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +EDAC_ECS_ATTR_STORE(log_entry_type, set_log_entry_type, unsigned long, kstrtoul) +EDAC_ECS_ATTR_STORE(mode, set_mode, unsigned long, kstrtoul) +EDAC_ECS_ATTR_STORE(reset, reset, unsigned long, 
kstrtoul) +EDAC_ECS_ATTR_STORE(threshold, set_threshold, unsigned long, kstrtoul) + +static umode_t ecs_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; + + switch (attr_id) { + case ECS_LOG_ENTRY_TYPE: + if (ops->get_log_entry_type) { + if (ops->set_log_entry_type) + return a->mode; + else + return 0444; + } + break; + case ECS_MODE: + if (ops->get_mode) { + if (ops->set_mode) + return a->mode; + else + return 0444; + } + break; + case ECS_RESET: + if (ops->reset) + return a->mode; + break; + case ECS_THRESHOLD: + if (ops->get_threshold) { + if (ops->set_threshold) + return a->mode; + else + return 0444; + } + break; + default: + break; + } + + return 0; +} + +#define EDAC_ECS_ATTR_RO(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .fru_id = _fru_id }) + +#define EDAC_ECS_ATTR_WO(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .fru_id = _fru_id }) + +#define EDAC_ECS_ATTR_RW(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .fru_id = _fru_id }) + +static int ecs_create_desc(struct device *ecs_dev, const struct attribute_group **attr_groups, + u16 num_media_frus) +{ + struct edac_ecs_context *ecs_ctx; + u32 fru; + + ecs_ctx = devm_kzalloc(ecs_dev, sizeof(*ecs_ctx), GFP_KERNEL); + if (!ecs_ctx) + return -ENOMEM; + + ecs_ctx->num_media_frus = num_media_frus; + ecs_ctx->fru_ctxs = devm_kcalloc(ecs_dev, num_media_frus, + sizeof(*ecs_ctx->fru_ctxs), + GFP_KERNEL); + if (!ecs_ctx->fru_ctxs) + return -ENOMEM; + + for (fru = 0; fru < num_media_frus; fru++) { + struct edac_ecs_fru_context *fru_ctx = &ecs_ctx->fru_ctxs[fru]; + struct attribute_group *group = &fru_ctx->group; + int i; + + fru_ctx->dev_attr[ECS_LOG_ENTRY_TYPE] = EDAC_ECS_ATTR_RW(log_entry_type, fru); + fru_ctx->dev_attr[ECS_MODE] = EDAC_ECS_ATTR_RW(mode, fru); + fru_ctx->dev_attr[ECS_RESET] = EDAC_ECS_ATTR_WO(reset, fru); + fru_ctx->dev_attr[ECS_THRESHOLD] = EDAC_ECS_ATTR_RW(threshold, fru); + + for (i = 0; i < ECS_MAX_ATTRS; i++) + fru_ctx->ecs_attrs[i] = &fru_ctx->dev_attr[i].dev_attr.attr; + + sprintf(fru_ctx->name, "%s%d", EDAC_ECS_FRU_NAME, fru); + group->name = fru_ctx->name; + group->attrs = fru_ctx->ecs_attrs; + group->is_visible = ecs_attr_visible; + + attr_groups[fru] = group; + } + + return 0; +} + +/** + * edac_ecs_get_desc - get EDAC ECS descriptors + * @ecs_dev: client device, supports ECS feature + * @attr_groups: pointer to attribute group container + * @num_media_frus: number of media FRUs in the device + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, u16 num_media_frus) +{ + if (!ecs_dev || !attr_groups || !num_media_frus) + return -EINVAL; + + return ecs_create_desc(ecs_dev, attr_groups, num_media_frus); +} diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 621dc2a5d034..0734909b08a4 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -570,3 +570,188 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, block ? 
block->name : "N/A", count, msg); } EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); + +static void edac_dev_release(struct device *dev) +{ + struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); + + kfree(ctx->mem_repair); + kfree(ctx->scrub); + kfree(ctx->dev.groups); + kfree(ctx); +} + +static const struct device_type edac_dev_type = { + .name = "edac_dev", + .release = edac_dev_release, +}; + +static void edac_dev_unreg(void *data) +{ + device_unregister(data); +} + +/** + * edac_dev_register - register device for RAS features with EDAC + * @parent: parent device. + * @name: name for the folder in the /sys/bus/edac/devices/, + * which is derived from the parent device. + * For e.g. /sys/bus/edac/devices/cxl_mem0/ + * @private: parent driver's data to store in the context if any. + * @num_features: number of RAS features to register. + * @ras_features: list of RAS features to register. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + * + */ +int edac_dev_register(struct device *parent, char *name, + void *private, int num_features, + const struct edac_dev_feature *ras_features) +{ + const struct attribute_group **ras_attr_groups; + struct edac_dev_data *dev_data; + struct edac_dev_feat_ctx *ctx; + int mem_repair_cnt = 0; + int attr_gcnt = 0; + int ret = -ENOMEM; + int scrub_cnt = 0; + int feat; + + if (!parent || !name || !num_features || !ras_features) + return -EINVAL; + + /* Double parse to make space for attributes */ + for (feat = 0; feat < num_features; feat++) { + switch (ras_features[feat].ft_type) { + case RAS_FEAT_SCRUB: + attr_gcnt++; + scrub_cnt++; + break; + case RAS_FEAT_ECS: + attr_gcnt += ras_features[feat].ecs_info.num_media_frus; + break; + case RAS_FEAT_MEM_REPAIR: + attr_gcnt++; + mem_repair_cnt++; + break; + default: + return -EINVAL; + } + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); + if (!ras_attr_groups) + goto ctx_free; + + if (scrub_cnt) { + ctx->scrub = kcalloc(scrub_cnt, sizeof(*ctx->scrub), GFP_KERNEL); + if (!ctx->scrub) + goto groups_free; + } + + if (mem_repair_cnt) { + ctx->mem_repair = kcalloc(mem_repair_cnt, sizeof(*ctx->mem_repair), GFP_KERNEL); + if (!ctx->mem_repair) + goto data_mem_free; + } + + attr_gcnt = 0; + scrub_cnt = 0; + mem_repair_cnt = 0; + for (feat = 0; feat < num_features; feat++, ras_features++) { + switch (ras_features->ft_type) { + case RAS_FEAT_SCRUB: + if (!ras_features->scrub_ops || scrub_cnt != ras_features->instance) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->scrub[scrub_cnt]; + dev_data->instance = scrub_cnt; + dev_data->scrub_ops = ras_features->scrub_ops; + dev_data->private = ras_features->ctx; + ret = edac_scrub_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->instance); + if (ret) + goto data_mem_free; + + scrub_cnt++; + attr_gcnt++; + break; + case RAS_FEAT_ECS: + if (!ras_features->ecs_ops) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->ecs; + dev_data->ecs_ops = ras_features->ecs_ops; + dev_data->private = ras_features->ctx; + ret = edac_ecs_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->ecs_info.num_media_frus); + if (ret) + goto data_mem_free; + + attr_gcnt += ras_features->ecs_info.num_media_frus; + break; + case RAS_FEAT_MEM_REPAIR: + if (!ras_features->mem_repair_ops || + mem_repair_cnt != ras_features->instance) { + ret 
= -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->mem_repair[mem_repair_cnt]; + dev_data->instance = mem_repair_cnt; + dev_data->mem_repair_ops = ras_features->mem_repair_ops; + dev_data->private = ras_features->ctx; + ret = edac_mem_repair_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->instance); + if (ret) + goto data_mem_free; + + mem_repair_cnt++; + attr_gcnt++; + break; + default: + ret = -EINVAL; + goto data_mem_free; + } + } + + ctx->dev.parent = parent; + ctx->dev.bus = edac_get_sysfs_subsys(); + ctx->dev.type = &edac_dev_type; + ctx->dev.groups = ras_attr_groups; + ctx->private = private; + dev_set_drvdata(&ctx->dev, ctx); + + ret = dev_set_name(&ctx->dev, "%s", name); + if (ret) + goto data_mem_free; + + ret = device_register(&ctx->dev); + if (ret) { + put_device(&ctx->dev); + return ret; + } + + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); + +data_mem_free: + kfree(ctx->mem_repair); + kfree(ctx->scrub); +groups_free: + kfree(ras_attr_groups); +ctx_free: + kfree(ctx); + return ret; +} +EXPORT_SYMBOL_GPL(edac_dev_register); diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index f45d849d3f15..355a977019e9 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -751,6 +751,8 @@ static int i10nm_get_ddr_munits(void) continue; } else { d->imc[lmc].mdev = mdev; + if (res_cfg->type == SPR) + skx_set_mc_mapping(d, i, lmc); lmc++; } } diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 49b4499269fb..b5cf25905b05 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/edac.h> #include <linux/mmzone.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -899,7 +900,7 @@ static void decode_mtr(int slot_row, u16 mtr) edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n", - MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled"); + str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr))); edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr)); edac_dbg(2, "\t\tNUMRANK: %s\n", diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 61adaa872ba7..69068f8d0cad 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/edac.h> #include <linux/mmzone.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -620,7 +621,7 @@ static int decode_mtr(struct i7300_pvt *pvt, edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n", - MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled"); + str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr))); edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr)); edac_dbg(2, "\t\tNUMRANK: %s\n", @@ -871,9 +872,9 @@ static int i7300_get_mc_regs(struct mem_ctl_info *mci) IS_MIRRORED(pvt->mc_settings) ? "" : "non-"); edac_dbg(0, "Error detection is %s\n", - IS_ECC_ENABLED(pvt->mc_settings) ? "enabled" : "disabled"); + str_enabled_disabled(IS_ECC_ENABLED(pvt->mc_settings))); edac_dbg(0, "Retry is %s\n", - IS_RETRY_ENABLED(pvt->mc_settings) ? 
"enabled" : "disabled"); + str_enabled_disabled(IS_RETRY_ENABLED(pvt->mc_settings))); /* Get Memory Interleave Range registers */ pci_read_config_word(pvt->pci_dev_16_1_fsb_addr_map, MIR0, diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 4fc16922dc1a..204834149579 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -51,6 +51,7 @@ #include <linux/edac.h> #include <linux/io-64-nonatomic-lo-hi.h> +#include <asm/mce.h> #include "edac_module.h" #define EDAC_MOD_STR "ie31200_edac" @@ -84,44 +85,23 @@ #define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_9 0x3ec6 #define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_10 0x3eca -/* Test if HB is for Skylake or later. */ -#define DEVICE_ID_SKYLAKE_OR_LATER(did) \ - (((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_8) || \ - ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_9) || \ - ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_10) || \ - ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_11) || \ - ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_12) || \ - (((did) & PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK) == \ - PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK)) - -#define IE31200_DIMMS 4 -#define IE31200_RANKS 8 -#define IE31200_RANKS_PER_CHANNEL 4 +/* Raptor Lake-S */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1 0xa703 +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2 0x4640 +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3 0x4630 + +#define IE31200_RANKS_PER_CHANNEL 8 #define IE31200_DIMMS_PER_CHANNEL 2 #define IE31200_CHANNELS 2 +#define IE31200_IMC_NUM 2 /* Intel IE31200 register addresses - device 0 function 0 - DRAM Controller */ #define IE31200_MCHBAR_LOW 0x48 #define IE31200_MCHBAR_HIGH 0x4c -#define IE31200_MCHBAR_MASK GENMASK_ULL(38, 15) -#define IE31200_MMR_WINDOW_SIZE BIT(15) /* * Error Status Register (16b) * - * 15 reserved - * 14 Isochronous TBWRR Run Behind FIFO Full - * (ITCV) - * 13 Isochronous TBWRR Run Behind FIFO Put - * (ITSTV) - * 12 reserved - * 11 MCH Thermal Sensor Event - * for SMI/SCI/SERR (GTSE) - * 10 reserved - * 9 LOCK to non-DRAM Memory Flag (LCKF) - * 8 reserved - * 7 DRAM Throttle Flag (DTF) - * 6:2 reserved * 1 Multi-bit DRAM ECC Error Flag (DMERR) * 0 Single-bit DRAM ECC Error Flag (DSERR) */ @@ -130,68 +110,60 @@ #define IE31200_ERRSTS_CE BIT(0) #define IE31200_ERRSTS_BITS (IE31200_ERRSTS_UE | IE31200_ERRSTS_CE) -/* - * Channel 0 ECC Error Log (64b) - * - * 63:48 Error Column Address (ERRCOL) - * 47:32 Error Row Address (ERRROW) - * 31:29 Error Bank Address (ERRBANK) - * 28:27 Error Rank Address (ERRRANK) - * 26:24 reserved - * 23:16 Error Syndrome (ERRSYND) - * 15: 2 reserved - * 1 Multiple Bit Error Status (MERRSTS) - * 0 Correctable Error Status (CERRSTS) - */ - -#define IE31200_C0ECCERRLOG 0x40c8 -#define IE31200_C1ECCERRLOG 0x44c8 -#define IE31200_C0ECCERRLOG_SKL 0x4048 -#define IE31200_C1ECCERRLOG_SKL 0x4448 -#define IE31200_ECCERRLOG_CE BIT(0) -#define IE31200_ECCERRLOG_UE BIT(1) -#define IE31200_ECCERRLOG_RANK_BITS GENMASK_ULL(28, 27) -#define IE31200_ECCERRLOG_RANK_SHIFT 27 -#define IE31200_ECCERRLOG_SYNDROME_BITS GENMASK_ULL(23, 16) -#define IE31200_ECCERRLOG_SYNDROME_SHIFT 16 - -#define IE31200_ECCERRLOG_SYNDROME(log) \ - ((log & IE31200_ECCERRLOG_SYNDROME_BITS) >> \ - IE31200_ECCERRLOG_SYNDROME_SHIFT) - #define IE31200_CAPID0 0xe4 #define IE31200_CAPID0_PDCD BIT(4) #define IE31200_CAPID0_DDPCD BIT(6) #define IE31200_CAPID0_ECC BIT(1) -#define IE31200_MAD_DIMM_0_OFFSET 0x5004 -#define IE31200_MAD_DIMM_0_OFFSET_SKL 0x500C -#define IE31200_MAD_DIMM_SIZE GENMASK_ULL(7, 0) -#define IE31200_MAD_DIMM_A_RANK BIT(17) -#define 
IE31200_MAD_DIMM_A_RANK_SHIFT 17 -#define IE31200_MAD_DIMM_A_RANK_SKL BIT(10) -#define IE31200_MAD_DIMM_A_RANK_SKL_SHIFT 10 -#define IE31200_MAD_DIMM_A_WIDTH BIT(19) -#define IE31200_MAD_DIMM_A_WIDTH_SHIFT 19 -#define IE31200_MAD_DIMM_A_WIDTH_SKL GENMASK_ULL(9, 8) -#define IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT 8 - -/* Skylake reports 1GB increments, everything else is 256MB */ -#define IE31200_PAGES(n, skl) \ - (n << (28 + (2 * skl) - PAGE_SHIFT)) +/* Non-constant mask variant of FIELD_GET() */ +#define field_get(_mask, _reg) (((_reg) & (_mask)) >> (ffs(_mask) - 1)) static int nr_channels; static struct pci_dev *mci_pdev; static int ie31200_registered = 1; +struct res_config { + enum mem_type mtype; + bool cmci; + int imc_num; + /* Host MMIO configuration register */ + u64 reg_mchbar_mask; + u64 reg_mchbar_window_size; + /* ECC error log register */ + u64 reg_eccerrlog_offset[IE31200_CHANNELS]; + u64 reg_eccerrlog_ce_mask; + u64 reg_eccerrlog_ce_ovfl_mask; + u64 reg_eccerrlog_ue_mask; + u64 reg_eccerrlog_ue_ovfl_mask; + u64 reg_eccerrlog_rank_mask; + u64 reg_eccerrlog_syndrome_mask; + /* MSR to clear ECC error log register */ + u32 msr_clear_eccerrlog_offset; + /* DIMM characteristics register */ + u64 reg_mad_dimm_size_granularity; + u64 reg_mad_dimm_offset[IE31200_CHANNELS]; + u32 reg_mad_dimm_size_mask[IE31200_DIMMS_PER_CHANNEL]; + u32 reg_mad_dimm_rank_mask[IE31200_DIMMS_PER_CHANNEL]; + u32 reg_mad_dimm_width_mask[IE31200_DIMMS_PER_CHANNEL]; +}; + struct ie31200_priv { void __iomem *window; void __iomem *c0errlog; void __iomem *c1errlog; + struct res_config *cfg; + struct mem_ctl_info *mci; + struct pci_dev *pdev; + struct device dev; }; +static struct ie31200_pvt { + struct ie31200_priv *priv[IE31200_IMC_NUM]; +} ie31200_pvt; + enum ie31200_chips { IE31200 = 0, + IE31200_1 = 1, }; struct ie31200_dev_info { @@ -202,18 +174,22 @@ struct ie31200_error_info { u16 errsts; u16 errsts2; u64 eccerrlog[IE31200_CHANNELS]; + u64 erraddr; }; static const struct ie31200_dev_info ie31200_devs[] = { [IE31200] = { .ctl_name = "IE31200" }, + [IE31200_1] = { + .ctl_name = "IE31200_1" + }, }; struct dimm_data { - u8 size; /* in multiples of 256MB, except Skylake is 1GB */ - u8 dual_rank : 1, - x16_width : 2; /* 0 means x8 width */ + u64 size; /* in bytes */ + u8 ranks; + enum dev_type dtype; }; static int how_many_channels(struct pci_dev *pdev) @@ -251,29 +227,54 @@ static bool ecc_capable(struct pci_dev *pdev) return true; } -static int eccerrlog_row(u64 log) -{ - return ((log & IE31200_ECCERRLOG_RANK_BITS) >> - IE31200_ECCERRLOG_RANK_SHIFT); -} +#define mci_to_pci_dev(mci) (((struct ie31200_priv *)(mci)->pvt_info)->pdev) static void ie31200_clear_error_info(struct mem_ctl_info *mci) { + struct ie31200_priv *priv = mci->pvt_info; + struct res_config *cfg = priv->cfg; + + /* + * The PCI ERRSTS register is deprecated. Write the MSR to clear + * the ECC error log registers in all memory controllers. + */ + if (cfg->msr_clear_eccerrlog_offset) { + if (wrmsr_safe(cfg->msr_clear_eccerrlog_offset, + cfg->reg_eccerrlog_ce_mask | + cfg->reg_eccerrlog_ce_ovfl_mask | + cfg->reg_eccerrlog_ue_mask | + cfg->reg_eccerrlog_ue_ovfl_mask, 0) < 0) + ie31200_printk(KERN_ERR, "Failed to wrmsr.\n"); + + return; + } + /* * Clear any error bits. * (Yes, we really clear bits by writing 1 to them.) 
*/ - pci_write_bits16(to_pci_dev(mci->pdev), IE31200_ERRSTS, + pci_write_bits16(mci_to_pci_dev(mci), IE31200_ERRSTS, IE31200_ERRSTS_BITS, IE31200_ERRSTS_BITS); } static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci, struct ie31200_error_info *info) { - struct pci_dev *pdev; + struct pci_dev *pdev = mci_to_pci_dev(mci); struct ie31200_priv *priv = mci->pvt_info; - pdev = to_pci_dev(mci->pdev); + /* + * The PCI ERRSTS register is deprecated, directly read the + * MMIO-mapped ECC error log registers. + */ + if (priv->cfg->msr_clear_eccerrlog_offset) { + info->eccerrlog[0] = lo_hi_readq(priv->c0errlog); + if (nr_channels == 2) + info->eccerrlog[1] = lo_hi_readq(priv->c1errlog); + + ie31200_clear_error_info(mci); + return; + } /* * This is a mess because there is no atomic way to read all the @@ -309,46 +310,56 @@ static void ie31200_get_and_clear_error_info(struct mem_ctl_info *mci, static void ie31200_process_error_info(struct mem_ctl_info *mci, struct ie31200_error_info *info) { + struct ie31200_priv *priv = mci->pvt_info; + struct res_config *cfg = priv->cfg; int channel; u64 log; - if (!(info->errsts & IE31200_ERRSTS_BITS)) - return; + if (!cfg->msr_clear_eccerrlog_offset) { + if (!(info->errsts & IE31200_ERRSTS_BITS)) + return; - if ((info->errsts ^ info->errsts2) & IE31200_ERRSTS_BITS) { - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 0, 0, 0, - -1, -1, -1, "UE overwrote CE", ""); - info->errsts = info->errsts2; + if ((info->errsts ^ info->errsts2) & IE31200_ERRSTS_BITS) { + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, 0, 0, 0, + -1, -1, -1, "UE overwrote CE", ""); + info->errsts = info->errsts2; + } } for (channel = 0; channel < nr_channels; channel++) { log = info->eccerrlog[channel]; - if (log & IE31200_ECCERRLOG_UE) { + if (log & cfg->reg_eccerrlog_ue_mask) { edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - 0, 0, 0, - eccerrlog_row(log), + info->erraddr >> PAGE_SHIFT, 0, 0, + field_get(cfg->reg_eccerrlog_rank_mask, log), channel, -1, "ie31200 UE", ""); - } else if (log & IE31200_ECCERRLOG_CE) { + } else if (log & cfg->reg_eccerrlog_ce_mask) { edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - 0, 0, - IE31200_ECCERRLOG_SYNDROME(log), - eccerrlog_row(log), + info->erraddr >> PAGE_SHIFT, 0, + field_get(cfg->reg_eccerrlog_syndrome_mask, log), + field_get(cfg->reg_eccerrlog_rank_mask, log), channel, -1, "ie31200 CE", ""); } } } -static void ie31200_check(struct mem_ctl_info *mci) +static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce) { struct ie31200_error_info info; + info.erraddr = mce ? 
mce->addr : 0; ie31200_get_and_clear_error_info(mci, &info); ie31200_process_error_info(mci, &info); } -static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev) +static void ie31200_check(struct mem_ctl_info *mci) +{ + __ie31200_check(mci, NULL); +} + +static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc) { union { u64 mchbar; @@ -361,7 +372,8 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev) pci_read_config_dword(pdev, IE31200_MCHBAR_LOW, &u.mchbar_low); pci_read_config_dword(pdev, IE31200_MCHBAR_HIGH, &u.mchbar_high); - u.mchbar &= IE31200_MCHBAR_MASK; + u.mchbar &= cfg->reg_mchbar_mask; + u.mchbar += cfg->reg_mchbar_window_size * mc; if (u.mchbar != (resource_size_t)u.mchbar) { ie31200_printk(KERN_ERR, "mmio space beyond accessible range (0x%llx)\n", @@ -369,7 +381,7 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev) return NULL; } - window = ioremap(u.mchbar, IE31200_MMR_WINDOW_SIZE); + window = ioremap(u.mchbar, cfg->reg_mchbar_window_size); if (!window) ie31200_printk(KERN_ERR, "Cannot map mmio space at 0x%llx\n", (unsigned long long)u.mchbar); @@ -377,155 +389,108 @@ static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev) return window; } -static void __skl_populate_dimm_info(struct dimm_data *dd, u32 addr_decode, - int chan) +static void populate_dimm_info(struct dimm_data *dd, u32 addr_decode, int dimm, + struct res_config *cfg) { - dd->size = (addr_decode >> (chan << 4)) & IE31200_MAD_DIMM_SIZE; - dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK_SKL << (chan << 4))) ? 1 : 0; - dd->x16_width = ((addr_decode & (IE31200_MAD_DIMM_A_WIDTH_SKL << (chan << 4))) >> - (IE31200_MAD_DIMM_A_WIDTH_SKL_SHIFT + (chan << 4))); + dd->size = field_get(cfg->reg_mad_dimm_size_mask[dimm], addr_decode) * cfg->reg_mad_dimm_size_granularity; + dd->ranks = field_get(cfg->reg_mad_dimm_rank_mask[dimm], addr_decode) + 1; + dd->dtype = field_get(cfg->reg_mad_dimm_width_mask[dimm], addr_decode) + DEV_X8; } -static void __populate_dimm_info(struct dimm_data *dd, u32 addr_decode, - int chan) +static void ie31200_get_dimm_config(struct mem_ctl_info *mci, void __iomem *window, + struct res_config *cfg, int mc) { - dd->size = (addr_decode >> (chan << 3)) & IE31200_MAD_DIMM_SIZE; - dd->dual_rank = (addr_decode & (IE31200_MAD_DIMM_A_RANK << chan)) ? 1 : 0; - dd->x16_width = (addr_decode & (IE31200_MAD_DIMM_A_WIDTH << chan)) ? 
1 : 0; -} + struct dimm_data dimm_info; + struct dimm_info *dimm; + unsigned long nr_pages; + u32 addr_decode; + int i, j, k; -static void populate_dimm_info(struct dimm_data *dd, u32 addr_decode, int chan, - bool skl) -{ - if (skl) - __skl_populate_dimm_info(dd, addr_decode, chan); - else - __populate_dimm_info(dd, addr_decode, chan); -} + for (i = 0; i < IE31200_CHANNELS; i++) { + addr_decode = readl(window + cfg->reg_mad_dimm_offset[i]); + edac_dbg(0, "addr_decode: 0x%x\n", addr_decode); + for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) { + populate_dimm_info(&dimm_info, addr_decode, j, cfg); + edac_dbg(0, "mc: %d, channel: %d, dimm: %d, size: %lld MiB, ranks: %d, DRAM chip type: %d\n", + mc, i, j, dimm_info.size >> 20, + dimm_info.ranks, + dimm_info.dtype); -static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) + nr_pages = MiB_TO_PAGES(dimm_info.size >> 20); + if (nr_pages == 0) + continue; + + nr_pages = nr_pages / dimm_info.ranks; + for (k = 0; k < dimm_info.ranks; k++) { + dimm = edac_get_dimm(mci, (j * dimm_info.ranks) + k, i, 0); + dimm->nr_pages = nr_pages; + edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages); + dimm->grain = 8; /* just a guess */ + dimm->mtype = cfg->mtype; + dimm->dtype = dimm_info.dtype; + dimm->edac_mode = EDAC_UNKNOWN; + } + } + } +} + +static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, int mc) { - int i, j, ret; - struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; - struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL]; - void __iomem *window; struct ie31200_priv *priv; - u32 addr_decode, mad_offset; - - /* - * Kaby Lake, Coffee Lake seem to work like Skylake. Please re-visit - * this logic when adding new CPU support. - */ - bool skl = DEVICE_ID_SKYLAKE_OR_LATER(pdev->device); - - edac_dbg(0, "MC:\n"); - - if (!ecc_capable(pdev)) { - ie31200_printk(KERN_INFO, "No ECC support\n"); - return -ENODEV; - } + struct mem_ctl_info *mci; + void __iomem *window; + int ret; nr_channels = how_many_channels(pdev); layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; - layers[0].size = IE31200_DIMMS; + layers[0].size = IE31200_RANKS_PER_CHANNEL; layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; layers[1].size = nr_channels; layers[1].is_virt_csrow = false; - mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, + mci = edac_mc_alloc(mc, ARRAY_SIZE(layers), layers, sizeof(struct ie31200_priv)); if (!mci) return -ENOMEM; - window = ie31200_map_mchbar(pdev); + window = ie31200_map_mchbar(pdev, cfg, mc); if (!window) { ret = -ENODEV; goto fail_free; } edac_dbg(3, "MC: init mci\n"); - mci->pdev = &pdev->dev; - if (skl) - mci->mtype_cap = MEM_FLAG_DDR4; - else - mci->mtype_cap = MEM_FLAG_DDR3; + mci->mtype_cap = BIT(cfg->mtype); mci->edac_ctl_cap = EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->ctl_name = ie31200_devs[dev_idx].ctl_name; + mci->ctl_name = ie31200_devs[mc].ctl_name; mci->dev_name = pci_name(pdev); - mci->edac_check = ie31200_check; + mci->edac_check = cfg->cmci ? 
NULL : ie31200_check; mci->ctl_page_to_phys = NULL; priv = mci->pvt_info; priv->window = window; - if (skl) { - priv->c0errlog = window + IE31200_C0ECCERRLOG_SKL; - priv->c1errlog = window + IE31200_C1ECCERRLOG_SKL; - mad_offset = IE31200_MAD_DIMM_0_OFFSET_SKL; - } else { - priv->c0errlog = window + IE31200_C0ECCERRLOG; - priv->c1errlog = window + IE31200_C1ECCERRLOG; - mad_offset = IE31200_MAD_DIMM_0_OFFSET; - } - - /* populate DIMM info */ - for (i = 0; i < IE31200_CHANNELS; i++) { - addr_decode = readl(window + mad_offset + - (i * 4)); - edac_dbg(0, "addr_decode: 0x%x\n", addr_decode); - for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) { - populate_dimm_info(&dimm_info[i][j], addr_decode, j, - skl); - edac_dbg(0, "size: 0x%x, rank: %d, width: %d\n", - dimm_info[i][j].size, - dimm_info[i][j].dual_rank, - dimm_info[i][j].x16_width); - } - } - + priv->c0errlog = window + cfg->reg_eccerrlog_offset[0]; + priv->c1errlog = window + cfg->reg_eccerrlog_offset[1]; + priv->cfg = cfg; + priv->mci = mci; + priv->pdev = pdev; + device_initialize(&priv->dev); /* - * The dram rank boundary (DRB) reg values are boundary addresses - * for each DRAM rank with a granularity of 64MB. DRB regs are - * cumulative; the last one will contain the total memory - * contained in all ranks. + * The EDAC core uses mci->pdev (pointer to the structure device) + * as the memory controller ID. The SoCs attach one or more memory + * controllers to a single pci_dev (a single pci_dev->dev can + * correspond to multiple memory controllers). + * + * To make mci->pdev unique, assign pci_dev->dev to mci->pdev + * for the first memory controller and assign a unique priv->dev + * to mci->pdev for each additional memory controller. */ - for (i = 0; i < IE31200_DIMMS_PER_CHANNEL; i++) { - for (j = 0; j < IE31200_CHANNELS; j++) { - struct dimm_info *dimm; - unsigned long nr_pages; - - nr_pages = IE31200_PAGES(dimm_info[j][i].size, skl); - if (nr_pages == 0) - continue; - - if (dimm_info[j][i].dual_rank) { - nr_pages = nr_pages / 2; - dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0); - dimm->nr_pages = nr_pages; - edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages); - dimm->grain = 8; /* just a guess */ - if (skl) - dimm->mtype = MEM_DDR4; - else - dimm->mtype = MEM_DDR3; - dimm->dtype = DEV_UNKNOWN; - dimm->edac_mode = EDAC_UNKNOWN; - } - dimm = edac_get_dimm(mci, i * 2, j, 0); - dimm->nr_pages = nr_pages; - edac_dbg(0, "set nr pages: 0x%lx\n", nr_pages); - dimm->grain = 8; /* same guess */ - if (skl) - dimm->mtype = MEM_DDR4; - else - dimm->mtype = MEM_DDR3; - dimm->dtype = DEV_UNKNOWN; - dimm->edac_mode = EDAC_UNKNOWN; - } - } + mci->pdev = mc ? 
&priv->dev : &pdev->dev; + ie31200_get_dimm_config(mci, window, cfg, mc); ie31200_clear_error_info(mci); if (edac_mc_add_mc(mci)) { @@ -534,16 +499,115 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) goto fail_unmap; } - /* get this far and it's successful */ - edac_dbg(3, "MC: success\n"); + ie31200_pvt.priv[mc] = priv; return 0; - fail_unmap: iounmap(window); - fail_free: edac_mc_free(mci); + return ret; +} + +static void mce_check(struct mce *mce) +{ + struct ie31200_priv *priv; + int i; + + for (i = 0; i < IE31200_IMC_NUM; i++) { + priv = ie31200_pvt.priv[i]; + if (!priv) + continue; + + __ie31200_check(priv->mci, mce); + } +} + +static int mce_handler(struct notifier_block *nb, unsigned long val, void *data) +{ + struct mce *mce = (struct mce *)data; + char *type; + + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + + /* + * Ignore unless this is a memory related error. + * Don't check MCI_STATUS_ADDRV since it's not set on some CPUs. + */ + if ((mce->status & 0xefff) >> 7 != 1) + return NOTIFY_DONE; + + type = mce->mcgstatus & MCG_STATUS_MCIP ? "Exception" : "Event"; + + edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n", + mce->extcpu, type, mce->mcgstatus, + mce->bank, mce->status); + edac_dbg(0, "TSC 0x%llx\n", mce->tsc); + edac_dbg(0, "ADDR 0x%llx\n", mce->addr); + edac_dbg(0, "MISC 0x%llx\n", mce->misc); + edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n", + mce->cpuvendor, mce->cpuid, mce->time, + mce->socketid, mce->apicid); + + mce_check(mce); + mce->kflags |= MCE_HANDLED_EDAC; + + return NOTIFY_DONE; +} + +static struct notifier_block ie31200_mce_dec = { + .notifier_call = mce_handler, + .priority = MCE_PRIO_EDAC, +}; + +static void ie31200_unregister_mcis(void) +{ + struct ie31200_priv *priv; + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < IE31200_IMC_NUM; i++) { + priv = ie31200_pvt.priv[i]; + if (!priv) + continue; + mci = priv->mci; + edac_mc_del_mc(mci->pdev); + iounmap(priv->window); + edac_mc_free(mci); + } +} + +static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg) +{ + int i, ret; + + edac_dbg(0, "MC:\n"); + + if (!ecc_capable(pdev)) { + ie31200_printk(KERN_INFO, "No ECC support\n"); + return -ENODEV; + } + + for (i = 0; i < cfg->imc_num; i++) { + ret = ie31200_register_mci(pdev, cfg, i); + if (ret) + goto fail_register; + } + + if (cfg->cmci) { + mce_register_decode_chain(&ie31200_mce_dec); + edac_op_state = EDAC_OPSTATE_INT; + } else { + edac_op_state = EDAC_OPSTATE_POLL; + } + + /* get this far and it's successful. 
*/ + edac_dbg(3, "MC: success\n"); + return 0; + +fail_register: + ie31200_unregister_mcis(); return ret; } @@ -555,7 +619,7 @@ static int ie31200_init_one(struct pci_dev *pdev, edac_dbg(0, "MC:\n"); if (pci_enable_device(pdev) < 0) return -EIO; - rc = ie31200_probe1(pdev, ent->driver_data); + rc = ie31200_probe1(pdev, (struct res_config *)ent->driver_data); if (rc == 0 && !mci_pdev) mci_pdev = pci_dev_get(pdev); @@ -564,43 +628,112 @@ static int ie31200_init_one(struct pci_dev *pdev, static void ie31200_remove_one(struct pci_dev *pdev) { - struct mem_ctl_info *mci; - struct ie31200_priv *priv; + struct ie31200_priv *priv = ie31200_pvt.priv[0]; edac_dbg(0, "\n"); pci_dev_put(mci_pdev); mci_pdev = NULL; - mci = edac_mc_del_mc(&pdev->dev); - if (!mci) - return; - priv = mci->pvt_info; - iounmap(priv->window); - edac_mc_free(mci); + if (priv->cfg->cmci) + mce_unregister_decode_chain(&ie31200_mce_dec); + ie31200_unregister_mcis(); } +static struct res_config snb_cfg = { + .mtype = MEM_DDR3, + .imc_num = 1, + .reg_mchbar_mask = GENMASK_ULL(38, 15), + .reg_mchbar_window_size = BIT_ULL(15), + .reg_eccerrlog_offset[0] = 0x40c8, + .reg_eccerrlog_offset[1] = 0x44c8, + .reg_eccerrlog_ce_mask = BIT_ULL(0), + .reg_eccerrlog_ue_mask = BIT_ULL(1), + .reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27), + .reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16), + .reg_mad_dimm_size_granularity = BIT_ULL(28), + .reg_mad_dimm_offset[0] = 0x5004, + .reg_mad_dimm_offset[1] = 0x5008, + .reg_mad_dimm_size_mask[0] = GENMASK(7, 0), + .reg_mad_dimm_size_mask[1] = GENMASK(15, 8), + .reg_mad_dimm_rank_mask[0] = BIT(17), + .reg_mad_dimm_rank_mask[1] = BIT(18), + .reg_mad_dimm_width_mask[0] = BIT(19), + .reg_mad_dimm_width_mask[1] = BIT(20), +}; + +static struct res_config skl_cfg = { + .mtype = MEM_DDR4, + .imc_num = 1, + .reg_mchbar_mask = GENMASK_ULL(38, 15), + .reg_mchbar_window_size = BIT_ULL(15), + .reg_eccerrlog_offset[0] = 0x4048, + .reg_eccerrlog_offset[1] = 0x4448, + .reg_eccerrlog_ce_mask = BIT_ULL(0), + .reg_eccerrlog_ue_mask = BIT_ULL(1), + .reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27), + .reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16), + .reg_mad_dimm_size_granularity = BIT_ULL(30), + .reg_mad_dimm_offset[0] = 0x500c, + .reg_mad_dimm_offset[1] = 0x5010, + .reg_mad_dimm_size_mask[0] = GENMASK(5, 0), + .reg_mad_dimm_size_mask[1] = GENMASK(21, 16), + .reg_mad_dimm_rank_mask[0] = BIT(10), + .reg_mad_dimm_rank_mask[1] = BIT(26), + .reg_mad_dimm_width_mask[0] = GENMASK(9, 8), + .reg_mad_dimm_width_mask[1] = GENMASK(25, 24), +}; + +struct res_config rpl_s_cfg = { + .mtype = MEM_DDR5, + .cmci = true, + .imc_num = 2, + .reg_mchbar_mask = GENMASK_ULL(41, 17), + .reg_mchbar_window_size = BIT_ULL(16), + .reg_eccerrlog_offset[0] = 0xe048, + .reg_eccerrlog_offset[1] = 0xe848, + .reg_eccerrlog_ce_mask = BIT_ULL(0), + .reg_eccerrlog_ce_ovfl_mask = BIT_ULL(1), + .reg_eccerrlog_ue_mask = BIT_ULL(2), + .reg_eccerrlog_ue_ovfl_mask = BIT_ULL(3), + .reg_eccerrlog_rank_mask = GENMASK_ULL(28, 27), + .reg_eccerrlog_syndrome_mask = GENMASK_ULL(23, 16), + .msr_clear_eccerrlog_offset = 0x791, + .reg_mad_dimm_offset[0] = 0xd80c, + .reg_mad_dimm_offset[1] = 0xd810, + .reg_mad_dimm_size_granularity = BIT_ULL(29), + .reg_mad_dimm_size_mask[0] = GENMASK(6, 0), + .reg_mad_dimm_size_mask[1] = GENMASK(22, 16), + .reg_mad_dimm_rank_mask[0] = GENMASK(10, 9), + .reg_mad_dimm_rank_mask[1] = GENMASK(27, 26), + .reg_mad_dimm_width_mask[0] = GENMASK(8, 7), + .reg_mad_dimm_width_mask[1] = GENMASK(25, 24), +}; + static const struct pci_device_id 
ie31200_pci_tbl[] = { - { PCI_VEND_DEV(INTEL, IE31200_HB_1), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_2), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_3), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_4), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_5), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_6), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_7), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_9), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_10), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_11), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_12), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_1), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_2), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_3), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_4), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_5), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_6), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_7), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_8), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_9), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, - { PCI_VEND_DEV(INTEL, IE31200_HB_CFL_10), PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_1), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_2), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_3), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_4), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_5), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_6), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_7), (kernel_ulong_t)&snb_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_8), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_9), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_10), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_11), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_12), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_1), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_2), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_3), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_4), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_5), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_6), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_7), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_8), 
(kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_9), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_10), (kernel_ulong_t)&skl_cfg }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3), (kernel_ulong_t)&rpl_s_cfg}, { 0, } /* 0 terminated list. */ }; MODULE_DEVICE_TABLE(pci, ie31200_pci_tbl); @@ -617,12 +750,10 @@ static int __init ie31200_init(void) int pci_rc, i; edac_dbg(3, "MC:\n"); - /* Ensure that the OPSTATE is set correctly for POLL or NMI */ - opstate_init(); pci_rc = pci_register_driver(&ie31200_driver); if (pci_rc < 0) - goto fail0; + return pci_rc; if (!mci_pdev) { ie31200_registered = 0; @@ -633,11 +764,13 @@ static int __init ie31200_init(void) if (mci_pdev) break; } + if (!mci_pdev) { edac_dbg(0, "ie31200 pci_get_device fail\n"); pci_rc = -ENODEV; - goto fail1; + goto fail0; } + pci_rc = ie31200_init_one(mci_pdev, &ie31200_pci_tbl[i]); if (pci_rc < 0) { edac_dbg(0, "ie31200 init fail\n"); @@ -645,12 +778,12 @@ static int __init ie31200_init(void) goto fail1; } } - return 0; + return 0; fail1: - pci_unregister_driver(&ie31200_driver); -fail0: pci_dev_put(mci_pdev); +fail0: + pci_unregister_driver(&ie31200_driver); return pci_rc; } diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index fdf3a84fe698..5807517ee32d 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -125,7 +125,7 @@ #define MEM_SLICE_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6) #define MEM_SLICE_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26) -static struct res_config { +static const struct res_config { bool machine_check; int num_imc; u32 imc_base; @@ -472,7 +472,7 @@ static u64 rpl_p_err_addr(u64 ecclog) return ECC_ERROR_LOG_ADDR45(ecclog); } -static struct res_config ehl_cfg = { +static const struct res_config ehl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xdc00, @@ -482,7 +482,7 @@ static struct res_config ehl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static struct res_config icl_cfg = { +static const struct res_config icl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xd800, @@ -492,7 +492,7 @@ static struct res_config icl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static struct res_config tgl_cfg = { +static const struct res_config tgl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0x5000, @@ -506,7 +506,7 @@ static struct res_config tgl_cfg = { .err_addr_to_imc_addr = tgl_err_addr_to_imc_addr, }; -static struct res_config adl_cfg = { +static const struct res_config adl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -517,7 +517,7 @@ static struct res_config adl_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config adl_n_cfg = { +static const struct res_config adl_n_cfg = { .machine_check = true, .num_imc = 1, .imc_base = 0xd800, @@ -528,7 +528,7 @@ static struct res_config adl_n_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config rpl_p_cfg = { +static const struct res_config rpl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -540,7 +540,7 @@ static struct res_config rpl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config mtl_ps_cfg = { +static const struct res_config mtl_ps_cfg = { .machine_check 
= true, .num_imc = 2, .imc_base = 0xd800, @@ -551,7 +551,7 @@ static struct res_config mtl_ps_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config mtl_p_cfg = { +static const struct res_config mtl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -785,13 +785,22 @@ static u64 ecclog_read_and_clear(struct igen6_imc *imc) { u64 ecclog = readq(imc->window + ECC_ERROR_LOG_OFFSET); - if (ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE)) { - /* Clear CE/UE bits by writing 1s */ - writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET); - return ecclog; - } + /* + * Quirk: The ECC_ERROR_LOG register of certain SoCs may contain + * the invalid value ~0. This will result in a flood of invalid + * error reports in polling mode. Skip it. + */ + if (ecclog == ~0) + return 0; - return 0; + /* Neither a CE nor a UE. Skip it. */ + if (!(ecclog & (ECC_ERROR_LOG_CE | ECC_ERROR_LOG_UE))) + return 0; + + /* Clear CE/UE bits by writing 1s */ + writeq(ecclog, imc->window + ECC_ERROR_LOG_OFFSET); + + return ecclog; } static void errsts_clear(struct igen6_imc *imc) @@ -1374,7 +1383,7 @@ static void unregister_err_handler(void) unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME); } -static void opstate_set(struct res_config *cfg, const struct pci_device_id *ent) +static void opstate_set(const struct res_config *cfg, const struct pci_device_id *ent) { /* * Quirk: Certain SoCs' error reporting interrupts don't work. diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c new file mode 100755 index 000000000000..3b1a845457b0 --- /dev/null +++ b/drivers/edac/mem_repair.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic EDAC memory repair driver is designed to control the memory + * devices with memory repair features, such as Post Package Repair (PPR), + * memory sparing etc. The common sysfs memory repair interface abstracts + * the control of various arbitrary memory repair functionalities into a + * unified set of functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited.
+ */ + +#include <linux/edac.h> + +enum edac_mem_repair_attributes { + MR_TYPE, + MR_PERSIST_MODE, + MR_SAFE_IN_USE, + MR_HPA, + MR_MIN_HPA, + MR_MAX_HPA, + MR_DPA, + MR_MIN_DPA, + MR_MAX_DPA, + MR_NIBBLE_MASK, + MR_BANK_GROUP, + MR_BANK, + MR_RANK, + MR_ROW, + MR_COLUMN, + MR_CHANNEL, + MR_SUB_CHANNEL, + MEM_DO_REPAIR, + MR_MAX_ATTRS +}; + +struct edac_mem_repair_dev_attr { + struct device_attribute dev_attr; + u8 instance; +}; + +struct edac_mem_repair_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_mem_repair_dev_attr mem_repair_dev_attr[MR_MAX_ATTRS]; + struct attribute *mem_repair_attrs[MR_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +#define TO_MR_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_mem_repair_dev_attr, dev_attr) + +#define MR_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = \ + ctx->mem_repair[inst].mem_repair_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \ + &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +MR_ATTR_SHOW(repair_type, get_repair_type, const char *, "%s\n") +MR_ATTR_SHOW(persist_mode, get_persist_mode, bool, "%u\n") +MR_ATTR_SHOW(repair_safe_when_in_use, get_repair_safe_when_in_use, bool, "%u\n") +MR_ATTR_SHOW(hpa, get_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(min_hpa, get_min_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(max_hpa, get_max_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(dpa, get_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(min_dpa, get_min_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(max_dpa, get_max_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(nibble_mask, get_nibble_mask, u32, "0x%x\n") +MR_ATTR_SHOW(bank_group, get_bank_group, u32, "%u\n") +MR_ATTR_SHOW(bank, get_bank, u32, "%u\n") +MR_ATTR_SHOW(rank, get_rank, u32, "%u\n") +MR_ATTR_SHOW(row, get_row, u32, "0x%x\n") +MR_ATTR_SHOW(column, get_column, u32, "%u\n") +MR_ATTR_SHOW(channel, get_channel, u32, "%u\n") +MR_ATTR_SHOW(sub_channel, get_sub_channel, u32, "%u\n") + +#define MR_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = \ + ctx->mem_repair[inst].mem_repair_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \ + data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +MR_ATTR_STORE(persist_mode, set_persist_mode, unsigned long, kstrtoul) +MR_ATTR_STORE(hpa, set_hpa, u64, kstrtou64) +MR_ATTR_STORE(dpa, set_dpa, u64, kstrtou64) +MR_ATTR_STORE(nibble_mask, set_nibble_mask, unsigned long, kstrtoul) +MR_ATTR_STORE(bank_group, set_bank_group, unsigned long, kstrtoul) +MR_ATTR_STORE(bank, set_bank, unsigned long, kstrtoul) +MR_ATTR_STORE(rank, set_rank, unsigned long, kstrtoul) +MR_ATTR_STORE(row, set_row, unsigned long, kstrtoul) +MR_ATTR_STORE(column, set_column, unsigned long, kstrtoul) +MR_ATTR_STORE(channel, set_channel, unsigned long, kstrtoul) +MR_ATTR_STORE(sub_channel, set_sub_channel, unsigned long, kstrtoul) + +#define MR_DO_OP(attrib, 
cb) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops; \ + unsigned long data; \ + int ret; \ + \ + ret = kstrtoul(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +MR_DO_OP(repair, do_repair) + +static umode_t mem_repair_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr); + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + u8 inst = TO_MR_DEV_ATTR(dev_attr)->instance; + const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops; + + switch (attr_id) { + case MR_TYPE: + if (ops->get_repair_type) + return a->mode; + break; + case MR_PERSIST_MODE: + if (ops->get_persist_mode) { + if (ops->set_persist_mode) + return a->mode; + else + return 0444; + } + break; + case MR_SAFE_IN_USE: + if (ops->get_repair_safe_when_in_use) + return a->mode; + break; + case MR_HPA: + if (ops->get_hpa) { + if (ops->set_hpa) + return a->mode; + else + return 0444; + } + break; + case MR_MIN_HPA: + if (ops->get_min_hpa) + return a->mode; + break; + case MR_MAX_HPA: + if (ops->get_max_hpa) + return a->mode; + break; + case MR_DPA: + if (ops->get_dpa) { + if (ops->set_dpa) + return a->mode; + else + return 0444; + } + break; + case MR_MIN_DPA: + if (ops->get_min_dpa) + return a->mode; + break; + case MR_MAX_DPA: + if (ops->get_max_dpa) + return a->mode; + break; + case MR_NIBBLE_MASK: + if (ops->get_nibble_mask) { + if (ops->set_nibble_mask) + return a->mode; + else + return 0444; + } + break; + case MR_BANK_GROUP: + if (ops->get_bank_group) { + if (ops->set_bank_group) + return a->mode; + else + return 0444; + } + break; + case MR_BANK: + if (ops->get_bank) { + if (ops->set_bank) + return a->mode; + else + return 0444; + } + break; + case MR_RANK: + if (ops->get_rank) { + if (ops->set_rank) + return a->mode; + else + return 0444; + } + break; + case MR_ROW: + if (ops->get_row) { + if (ops->set_row) + return a->mode; + else + return 0444; + } + break; + case MR_COLUMN: + if (ops->get_column) { + if (ops->set_column) + return a->mode; + else + return 0444; + } + break; + case MR_CHANNEL: + if (ops->get_channel) { + if (ops->set_channel) + return a->mode; + else + return 0444; + } + break; + case MR_SUB_CHANNEL: + if (ops->get_sub_channel) { + if (ops->set_sub_channel) + return a->mode; + else + return 0444; + } + break; + case MEM_DO_REPAIR: + if (ops->do_repair) + return a->mode; + break; + default: + break; + } + + return 0; +} + +#define MR_ATTR_RO(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .instance = _instance }) + +#define MR_ATTR_WO(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .instance = _instance }) + +#define MR_ATTR_RW(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .instance = _instance }) + +static int mem_repair_create_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ + struct edac_mem_repair_context *ctx; + struct attribute_group 
*group; + int i; + struct edac_mem_repair_dev_attr dev_attr[] = { + [MR_TYPE] = MR_ATTR_RO(repair_type, instance), + [MR_PERSIST_MODE] = MR_ATTR_RW(persist_mode, instance), + [MR_SAFE_IN_USE] = MR_ATTR_RO(repair_safe_when_in_use, instance), + [MR_HPA] = MR_ATTR_RW(hpa, instance), + [MR_MIN_HPA] = MR_ATTR_RO(min_hpa, instance), + [MR_MAX_HPA] = MR_ATTR_RO(max_hpa, instance), + [MR_DPA] = MR_ATTR_RW(dpa, instance), + [MR_MIN_DPA] = MR_ATTR_RO(min_dpa, instance), + [MR_MAX_DPA] = MR_ATTR_RO(max_dpa, instance), + [MR_NIBBLE_MASK] = MR_ATTR_RW(nibble_mask, instance), + [MR_BANK_GROUP] = MR_ATTR_RW(bank_group, instance), + [MR_BANK] = MR_ATTR_RW(bank, instance), + [MR_RANK] = MR_ATTR_RW(rank, instance), + [MR_ROW] = MR_ATTR_RW(row, instance), + [MR_COLUMN] = MR_ATTR_RW(column, instance), + [MR_CHANNEL] = MR_ATTR_RW(channel, instance), + [MR_SUB_CHANNEL] = MR_ATTR_RW(sub_channel, instance), + [MEM_DO_REPAIR] = MR_ATTR_WO(repair, instance) + }; + + ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + for (i = 0; i < MR_MAX_ATTRS; i++) { + memcpy(&ctx->mem_repair_dev_attr[i], + &dev_attr[i], sizeof(dev_attr[i])); + ctx->mem_repair_attrs[i] = + &ctx->mem_repair_dev_attr[i].dev_attr.attr; + } + + sprintf(ctx->name, "%s%d", "mem_repair", instance); + group = &ctx->group; + group->name = ctx->name; + group->attrs = ctx->mem_repair_attrs; + group->is_visible = mem_repair_attr_visible; + attr_groups[0] = group; + + return 0; +} + +/** + * edac_mem_repair_get_desc - get EDAC memory repair descriptors + * @dev: client device with memory repair feature + * @attr_groups: pointer to attribute group container + * @instance: device's memory repair instance number. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, u8 instance) +{ + if (!dev || !attr_groups) + return -EINVAL; + + return mem_repair_create_desc(dev, attr_groups, instance); +} diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index f93f2f2b1cf2..af14c8a3279f 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -372,7 +372,7 @@ static int gen_asym_mask(struct b_cr_slice_channel_hash *p, struct b_cr_asym_mem_region1_mchbar *as1, struct b_cr_asym_2way_mem_region_mchbar *as2way) { - const int intlv[] = { 0x5, 0xA, 0x3, 0xC }; + static const int intlv[] = { 0x5, 0xA, 0x3, 0xC }; int mask = 0; if (as2way->asym_2way_interleave_enable) @@ -489,7 +489,7 @@ static int dnv_get_registers(void) */ static int get_registers(void) { - const int intlv[] = { 10, 11, 12, 12 }; + static const int intlv[] = { 10, 11, 12, 12 }; if (RD_REG(&tolud, b_cr_tolud_pci) || RD_REG(&touud_lo, b_cr_touud_lo_pci) || diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c new file mode 100755 index 000000000000..e421d3ebd959 --- /dev/null +++ b/drivers/edac/scrub.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic EDAC scrub driver controls the memory scrubbers in the + * system. The common sysfs scrub interface abstracts the control of + * various arbitrary scrubbing functionalities into a unified set of + * functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited. 
+ */ + +#include <linux/edac.h> + +enum edac_scrub_attributes { + SCRUB_ADDRESS, + SCRUB_SIZE, + SCRUB_ENABLE_BACKGROUND, + SCRUB_MIN_CYCLE_DURATION, + SCRUB_MAX_CYCLE_DURATION, + SCRUB_CUR_CYCLE_DURATION, + SCRUB_MAX_ATTRS +}; + +struct edac_scrub_dev_attr { + struct device_attribute dev_attr; + u8 instance; +}; + +struct edac_scrub_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_scrub_dev_attr scrub_dev_attr[SCRUB_MAX_ATTRS]; + struct attribute *scrub_attrs[SCRUB_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +#define TO_SCRUB_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_scrub_dev_attr, dev_attr) + +#define EDAC_SCRUB_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +EDAC_SCRUB_ATTR_SHOW(addr, read_addr, u64, "0x%llx\n") +EDAC_SCRUB_ATTR_SHOW(size, read_size, u64, "0x%llx\n") +EDAC_SCRUB_ATTR_SHOW(enable_background, get_enabled_bg, bool, "%u\n") +EDAC_SCRUB_ATTR_SHOW(min_cycle_duration, get_min_cycle, u32, "%u\n") +EDAC_SCRUB_ATTR_SHOW(max_cycle_duration, get_max_cycle, u32, "%u\n") +EDAC_SCRUB_ATTR_SHOW(current_cycle_duration, get_cycle_duration, u32, "%u\n") + +#define EDAC_SCRUB_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +EDAC_SCRUB_ATTR_STORE(addr, write_addr, u64, kstrtou64) +EDAC_SCRUB_ATTR_STORE(size, write_size, u64, kstrtou64) +EDAC_SCRUB_ATTR_STORE(enable_background, set_enabled_bg, unsigned long, kstrtoul) +EDAC_SCRUB_ATTR_STORE(current_cycle_duration, set_cycle_duration, unsigned long, kstrtoul) + +static umode_t scrub_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr); + u8 inst = TO_SCRUB_DEV_ATTR(dev_attr)->instance; + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; + + switch (attr_id) { + case SCRUB_ADDRESS: + if (ops->read_addr) { + if (ops->write_addr) + return a->mode; + else + return 0444; + } + break; + case SCRUB_SIZE: + if (ops->read_size) { + if (ops->write_size) + return a->mode; + else + return 0444; + } + break; + case SCRUB_ENABLE_BACKGROUND: + if (ops->get_enabled_bg) { + if (ops->set_enabled_bg) + return a->mode; + else + return 0444; + } + break; + case SCRUB_MIN_CYCLE_DURATION: + if (ops->get_min_cycle) + return a->mode; + break; + case SCRUB_MAX_CYCLE_DURATION: + if (ops->get_max_cycle) + return a->mode; + break; + case SCRUB_CUR_CYCLE_DURATION: + if (ops->get_cycle_duration) { + if 
(ops->set_cycle_duration) + return a->mode; + else + return 0444; + } + break; + default: + break; + } + + return 0; +} + +#define EDAC_SCRUB_ATTR_RO(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .instance = _instance }) + +#define EDAC_SCRUB_ATTR_WO(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .instance = _instance }) + +#define EDAC_SCRUB_ATTR_RW(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .instance = _instance }) + +static int scrub_create_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, u8 instance) +{ + struct edac_scrub_context *scrub_ctx; + struct attribute_group *group; + int i; + struct edac_scrub_dev_attr dev_attr[] = { + [SCRUB_ADDRESS] = EDAC_SCRUB_ATTR_RW(addr, instance), + [SCRUB_SIZE] = EDAC_SCRUB_ATTR_RW(size, instance), + [SCRUB_ENABLE_BACKGROUND] = EDAC_SCRUB_ATTR_RW(enable_background, instance), + [SCRUB_MIN_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(min_cycle_duration, instance), + [SCRUB_MAX_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(max_cycle_duration, instance), + [SCRUB_CUR_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RW(current_cycle_duration, instance) + }; + + scrub_ctx = devm_kzalloc(scrub_dev, sizeof(*scrub_ctx), GFP_KERNEL); + if (!scrub_ctx) + return -ENOMEM; + + group = &scrub_ctx->group; + for (i = 0; i < SCRUB_MAX_ATTRS; i++) { + memcpy(&scrub_ctx->scrub_dev_attr[i], &dev_attr[i], sizeof(dev_attr[i])); + scrub_ctx->scrub_attrs[i] = &scrub_ctx->scrub_dev_attr[i].dev_attr.attr; + } + sprintf(scrub_ctx->name, "%s%d", "scrub", instance); + group->name = scrub_ctx->name; + group->attrs = scrub_ctx->scrub_attrs; + group->is_visible = scrub_attr_visible; + + attr_groups[0] = group; + + return 0; +} + +/** + * edac_scrub_get_desc - get EDAC scrub descriptors + * @scrub_dev: client device, with scrub support + * @attr_groups: pointer to attribute group container + * @instance: device's scrub instance number. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, u8 instance) +{ + if (!scrub_dev || !attr_groups) + return -EINVAL; + + return scrub_create_desc(scrub_dev, attr_groups, instance); +} diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index f7bd930e058f..fa5b442b1844 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -121,6 +121,35 @@ void skx_adxl_put(void) } EXPORT_SYMBOL_GPL(skx_adxl_put); +static void skx_init_mc_mapping(struct skx_dev *d) +{ + /* + * By default, the BIOS presents all memory controllers within each + * socket to the EDAC driver. The physical indices are the same as + * the logical indices of the memory controllers enumerated by the + * EDAC driver. 
+ */ + for (int i = 0; i < NUM_IMC; i++) + d->mc_mapping[i] = i; +} + +void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) +{ + edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); + + d->mc_mapping[pmc] = lmc; +} +EXPORT_SYMBOL_GPL(skx_set_mc_mapping); + +static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +{ + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, d->mc_mapping[pmc]); + + return d->mc_mapping[pmc]; +} + static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { struct skx_dev *d; @@ -188,6 +217,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } + res->imc = skx_get_mc_mapping(d, res->imc); + for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) continue; @@ -326,6 +357,8 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) d->bus[0], d->bus[1], d->bus[2], d->bus[3]); list_add_tail(&d->list, &dev_edac_list); prev = pdev; + + skx_init_mc_mapping(d); } if (list) diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index b0845bdd4516..ca5408803f87 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -93,6 +93,16 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; + /* + * Some server BIOS may hide certain memory controllers, and the + * EDAC driver skips those hidden memory controllers. However, the + * ADXL still decodes memory error addresses using physical memory + * controller indices. The mapping table is used to convert the + * physical indices (reported by ADXL) to the logical indices + * (used by the EDAC driver) of present memory controllers during the + * error handling process. + */ + u8 mc_mapping[NUM_IMC]; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ @@ -242,6 +252,7 @@ void skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); void skx_set_mem_cfg(bool mem_cfg_2lm); void skx_set_res_cfg(struct res_config *cfg); +void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 699c7d29d80c..9955396c9a52 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -15,6 +15,7 @@ #include <linux/of.h> #include <linux/of_address.h> #include <linux/regmap.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -1407,7 +1408,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev) dev_err(edac_dev->dev, "Multiple XGIC write size error\n"); info = readl(ctx->dev_csr + XGICTRANSERRREQINFO); dev_err(edac_dev->dev, "XGIC %s access @ 0x%08X (0x%08X)\n", - info & REQTYPE_MASK ? "read" : "write", ERRADDR_RD(info), + str_read_write(info & REQTYPE_MASK), ERRADDR_RD(info), info); writel(reg, ctx->dev_csr + XGICTRANSERRINTSTS); @@ -1489,19 +1490,19 @@ static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev) if (reg & AGENT_OFFLINE_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s access to offline agent error\n", - write ? "write" : "read"); + str_write_read(write)); if (reg & UNIMPL_RBPAGE_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s access to unimplemented page error\n", - write ?
"write" : "read"); + str_write_read(write)); if (reg & WORD_ALIGNED_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s word aligned access error\n", - write ? "write" : "read"); + str_write_read(write)); if (reg & PAGE_ACCESS_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s to page out of range access error\n", - write ? "write" : "read"); + str_write_read(write)); if (regmap_write(ctx->edac->rb_map, RBEIR, 0)) return; if (regmap_write(ctx->edac->rb_map, RBCSR, 0)) @@ -1560,7 +1561,7 @@ rb_skip: err_addr_lo = readl(ctx->dev_csr + IOBBATRANSERRREQINFOL); err_addr_hi = readl(ctx->dev_csr + IOBBATRANSERRREQINFOH); dev_err(edac_dev->dev, "IOB BA %s access at 0x%02X.%08X (0x%08X)\n", - REQTYPE_F2_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_F2_RD(err_addr_hi)), ERRADDRH_F2_RD(err_addr_hi), err_addr_lo, err_addr_hi); if (reg & WRERR_RESP_MASK) dev_err(edac_dev->dev, "IOB BA requestor ID 0x%08X\n", @@ -1611,7 +1612,7 @@ chk_iob_axi0: dev_err(edac_dev->dev, "%sAXI slave 0 illegal %s access @ 0x%02X.%08X (0x%08X)\n", reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "", - REQTYPE_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_RD(err_addr_hi)), ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi); writel(reg, ctx->dev_csr + IOBAXIS0TRANSERRINTSTS); @@ -1625,7 +1626,7 @@ chk_iob_axi1: dev_err(edac_dev->dev, "%sAXI slave 1 illegal %s access @ 0x%02X.%08X (0x%08X)\n", reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "", - REQTYPE_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_RD(err_addr_hi)), ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi); writel(reg, ctx->dev_csr + IOBAXIS1TRANSERRINTSTS); } diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 5a732018be36..fd80b2f3233a 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -75,6 +75,10 @@ efi_status_t efi_random_alloc(unsigned long size, if (align < EFI_ALLOC_ALIGN) align = EFI_ALLOC_ALIGN; + /* Avoid address 0x0, as it can be mistaken for NULL */ + if (alloc_min == 0) + alloc_min = align; + size = round_up(size, EFI_ALLOC_ALIGN); /* count the suitable slots in each memory map entry */ diff --git a/drivers/firmware/imx/imx-scu.c b/drivers/firmware/imx/imx-scu.c index 1dd4362ef9a3..8c28e25ddc8a 100644 --- a/drivers/firmware/imx/imx-scu.c +++ b/drivers/firmware/imx/imx-scu.c @@ -280,6 +280,7 @@ static int imx_scu_probe(struct platform_device *pdev) return ret; sc_ipc->fast_ipc = of_device_is_compatible(args.np, "fsl,imx8-mu-scu"); + of_node_put(args.np); num_channel = sc_ipc->fast_ipc ? 
2 : SCU_MU_CHAN_NUM; for (i = 0; i < num_channel; i++) { diff --git a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c index 447246bd04be..98a463e9774b 100644 --- a/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c +++ b/drivers/firmware/qcom/qcom_qseecom_uefisecapp.c @@ -814,15 +814,6 @@ static int qcom_uefisecapp_probe(struct auxiliary_device *aux_dev, qcuefi->client = container_of(aux_dev, struct qseecom_client, aux_dev); - auxiliary_set_drvdata(aux_dev, qcuefi); - status = qcuefi_set_reference(qcuefi); - if (status) - return status; - - status = efivars_register(&qcuefi->efivars, &qcom_efivar_ops); - if (status) - qcuefi_set_reference(NULL); - memset(&pool_config, 0, sizeof(pool_config)); pool_config.initial_size = SZ_4K; pool_config.policy = QCOM_TZMEM_POLICY_MULTIPLIER; @@ -833,6 +824,15 @@ static int qcom_uefisecapp_probe(struct auxiliary_device *aux_dev, if (IS_ERR(qcuefi->mempool)) return PTR_ERR(qcuefi->mempool); + auxiliary_set_drvdata(aux_dev, qcuefi); + status = qcuefi_set_reference(qcuefi); + if (status) + return status; + + status = efivars_register(&qcuefi->efivars, &qcom_efivar_ops); + if (status) + qcuefi_set_reference(NULL); + return status; } diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index f0569bb9411f..fc4d67e4c4a6 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -2301,8 +2301,8 @@ static int qcom_scm_probe(struct platform_device *pdev) __scm->mempool = devm_qcom_tzmem_pool_new(__scm->dev, &pool_config); if (IS_ERR(__scm->mempool)) { - dev_err_probe(__scm->dev, PTR_ERR(__scm->mempool), - "Failed to create the SCM memory pool\n"); + ret = dev_err_probe(__scm->dev, PTR_ERR(__scm->mempool), + "Failed to create the SCM memory pool\n"); goto err; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index 03308261f894..7507d9443028 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -188,8 +188,8 @@ static int amdgpu_vkms_crtc_init(struct drm_device *dev, struct drm_crtc *crtc, amdgpu_crtc->connector = NULL; amdgpu_crtc->vsync_timer_enabled = AMDGPU_IRQ_STATE_DISABLE; - hrtimer_init(&amdgpu_crtc->vblank_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - amdgpu_crtc->vblank_timer.function = &amdgpu_vkms_vblank_simulate; + hrtimer_setup(&amdgpu_crtc->vblank_timer, &amdgpu_vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 2523221a2519..48ff00427882 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -2437,7 +2437,7 @@ static int gfx_v12_0_cp_gfx_load_me_microcode_rs64(struct amdgpu_device *adev) (void **)&adev->gfx.me.me_fw_data_ptr); if (r) { dev_err(adev->dev, "(%d) failed to create me data bo\n", r); - gfx_v12_0_pfp_fini(adev); + gfx_v12_0_me_fini(adev); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index 0fb88e6d5d54..c3c144a4f45e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -501,9 +501,6 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev, uint64_t *flags) { struct amdgpu_bo *bo = mapping->bo_va->base.bo; - struct amdgpu_device *bo_adev; - bool coherent, is_system; - *flags &= ~AMDGPU_PTE_EXECUTABLE; *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; @@ -519,26 +516,11 
@@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev, *flags &= ~AMDGPU_PTE_VALID; } - if (!bo) - return; - - if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT | - AMDGPU_GEM_CREATE_UNCACHED)) - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC); - - bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); - coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT; - is_system = bo->tbo.resource && - (bo->tbo.resource->mem_type == TTM_PL_TT || - bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT); - if (bo && bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) *flags |= AMDGPU_PTE_DCC; - /* WA for HW bug */ - if (is_system || ((bo_adev != adev) && coherent)) - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC); - + if (bo && bo->flags & AMDGPU_GEM_CREATE_UNCACHED) + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC); } static unsigned gmc_v12_0_get_vbios_fb_size(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 47db483c3516..95c609317a8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -78,12 +78,12 @@ static const struct amdgpu_video_codecs nv_video_codecs_encode = { /* Navi1x */ static const struct amdgpu_video_codec_info nv_video_codecs_decode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 8192, 8192, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)}, }; @@ -104,10 +104,10 @@ static const struct amdgpu_video_codecs sc_video_codecs_encode = { }; static const struct amdgpu_video_codec_info sc_video_codecs_decode_array_vcn0[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)}, @@ -115,10 +115,10 @@ static const struct amdgpu_video_codec_info sc_video_codecs_decode_array_vcn0[] }; static const struct amdgpu_video_codec_info sc_video_codecs_decode_array_vcn1[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, 
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)}, diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index a59b4c36cad7..e98fb3fa36a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -103,12 +103,11 @@ static const struct amdgpu_video_codecs vega_video_codecs_encode = /* Vega */ static const struct amdgpu_video_codec_info vega_video_codecs_decode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 186)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, }; static const struct amdgpu_video_codecs vega_video_codecs_decode = @@ -120,12 +119,12 @@ static const struct amdgpu_video_codecs vega_video_codecs_decode = /* Raven */ static const struct amdgpu_video_codec_info rv_video_codecs_decode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 186)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 8192, 8192, 0)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 4096, 4096, 0)}, }; @@ -138,10 +137,10 @@ static const struct amdgpu_video_codecs rv_video_codecs_decode = /* Renoir, Arcturus */ static const struct amdgpu_video_codec_info rn_video_codecs_decode_array[] = { - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 4096, 4096, 3)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 4096, 4096, 5)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, 1920, 1088, 3)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, 1920, 1088, 5)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)}, - {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)}, + {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 1920, 1088, 4)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)}, 
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)}, diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index 06615f160331..0c9c4d8b7b71 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -167,16 +167,16 @@ static const struct amdgpu_video_codec_info tonga_video_codecs_decode_array[] = { { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 3, }, { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 5, }, { @@ -188,9 +188,9 @@ static const struct amdgpu_video_codec_info tonga_video_codecs_decode_array[] = }, { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 4, }, }; @@ -206,16 +206,16 @@ static const struct amdgpu_video_codec_info cz_video_codecs_decode_array[] = { { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 3, }, { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 5, }, { @@ -227,9 +227,9 @@ static const struct amdgpu_video_codec_info cz_video_codecs_decode_array[] = }, { .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, + .max_width = 1920, + .max_height = 1088, + .max_pixels_per_frame = 1920 * 1088, .max_level = 4, }, { @@ -239,13 +239,6 @@ static const struct amdgpu_video_codec_info cz_video_codecs_decode_array[] = .max_pixels_per_frame = 4096 * 4096, .max_level = 186, }, - { - .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, - .max_width = 4096, - .max_height = 4096, - .max_pixels_per_frame = 4096 * 4096, - .max_level = 0, - }, }; static const struct amdgpu_video_codecs cz_video_codecs_decode = diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 651660958e5b..0320163b6e74 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -3644,7 +3644,7 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { }; static const uint32_t cwsr_trap_gfx12_hex[] = { - 0xbfa00001, 0xbfa0024b, + 0xbfa00001, 0xbfa002a2, 0xb0804009, 0xb8f8f804, 0x9178ff78, 0x00008c00, 0xb8fbf811, 0x8b6eff78, @@ -3718,7 +3718,15 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x00011677, 0xd7610000, 0x00011a79, 0xd7610000, 0x00011c7e, 0xd7610000, - 0x00011e7f, 0xbefe00ff, + 0x00011e7f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, 0x00003fff, 0xbeff0080, 0xee0a407a, 0x000c0000, 0x00004000, 0xd760007a, @@ -3755,38 +3763,46 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 
0x00000200, 0xbef600ff, 0x01000000, 0x7e000280, 0x7e020280, 0x7e040280, - 0xbefd0080, 0xbe804ec2, - 0xbf94fffe, 0xb8faf804, - 0x8b7a847a, 0x91788478, - 0x8c787a78, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xd7610002, 0x0000fa6c, - 0x807d817d, 0x917aff6d, - 0x80000000, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xd7610002, 0x0000fa6e, - 0x807d817d, 0xd7610002, - 0x0000fa6f, 0x807d817d, - 0xd7610002, 0x0000fa78, - 0x807d817d, 0xb8faf811, - 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xd7610002, - 0x0000fa7b, 0x807d817d, - 0xb8f1f801, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f814, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f815, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f812, 0xd7610002, - 0x0000fa71, 0x807d817d, - 0xb8f1f813, 0xd7610002, - 0x0000fa71, 0x807d817d, + 0xbe804ec2, 0xbf94fffe, + 0xb8faf804, 0x8b7a847a, + 0x91788478, 0x8c787a78, + 0x917aff6d, 0x80000000, + 0xd7610002, 0x00010071, + 0xd7610002, 0x0001026c, + 0xd7610002, 0x0001047a, + 0xd7610002, 0x0001066e, + 0xd7610002, 0x0001086f, + 0xd7610002, 0x00010a78, + 0xd7610002, 0x00010e7b, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xd8500000, 0x00000000, + 0xb8faf811, 0xd7610002, + 0x00010c7a, 0xb8faf801, + 0xd7610002, 0x0001107a, + 0xb8faf814, 0xd7610002, + 0x0001127a, 0xb8faf815, + 0xd7610002, 0x0001147a, + 0xb8faf812, 0xd7610002, + 0x0001167a, 0xb8faf813, + 0xd7610002, 0x0001187a, 0xb8faf802, 0xd7610002, - 0x0000fa7a, 0x807d817d, - 0xbefa50c1, 0xbfc70000, - 0xd7610002, 0x0000fa7a, - 0x807d817d, 0xbefe00ff, + 0x00011a7a, 0xbefa50c1, + 0xbfc70000, 0xd7610002, + 0x00011c7a, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xbefe00ff, 0x0000ffff, 0xbeff0080, 0xc4068070, 0x008ce802, 0x00000000, 0xbefe00c1, @@ -3801,331 +3817,358 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0xbe824102, 0xbe844104, 0xbe864106, 0xbe884108, 0xbe8a410a, 0xbe8c410c, - 0xbe8e410e, 0xd7610002, - 0x0000f200, 0x80798179, - 0xd7610002, 0x0000f201, - 0x80798179, 0xd7610002, - 0x0000f202, 0x80798179, - 0xd7610002, 0x0000f203, - 0x80798179, 0xd7610002, - 0x0000f204, 0x80798179, - 0xd7610002, 0x0000f205, - 0x80798179, 0xd7610002, - 0x0000f206, 0x80798179, - 0xd7610002, 0x0000f207, - 0x80798179, 0xd7610002, - 0x0000f208, 0x80798179, - 0xd7610002, 0x0000f209, - 0x80798179, 0xd7610002, - 0x0000f20a, 0x80798179, - 0xd7610002, 0x0000f20b, - 0x80798179, 0xd7610002, - 0x0000f20c, 0x80798179, - 0xd7610002, 0x0000f20d, - 0x80798179, 0xd7610002, - 0x0000f20e, 0x80798179, - 0xd7610002, 0x0000f20f, - 0x80798179, 0xbf06a079, - 0xbfa10007, 0xc4068070, + 0xbe8e410e, 0xbf068079, + 0xbfa10032, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd7610002, + 0x0001180c, 0xd7610002, + 0x00011a0d, 0xd7610002, + 0x00011c0e, 0xd7610002, + 0x00011e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xbfa00038, 0xd7610002, + 0x00012000, 0xd7610002, + 0x00012201, 
0xd7610002, + 0x00012402, 0xd7610002, + 0x00012603, 0xd7610002, + 0x00012804, 0xd7610002, + 0x00012a05, 0xd7610002, + 0x00012c06, 0xd7610002, + 0x00012e07, 0xd7610002, + 0x00013008, 0xd7610002, + 0x00013209, 0xd7610002, + 0x0001340a, 0xd7610002, + 0x0001360b, 0xd7610002, + 0x0001380c, 0xd7610002, + 0x00013a0d, 0xd7610002, + 0x00013c0e, 0xd7610002, + 0x00013e0f, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0x80799079, + 0xc4068070, 0x008ce802, + 0x00000000, 0x8070ff70, + 0x00000080, 0xbef90080, + 0x7e040280, 0x807d907d, + 0xbf0aff7d, 0x00000060, + 0xbfa2ff88, 0xbe804100, + 0xbe824102, 0xbe844104, + 0xbe864106, 0xbe884108, + 0xbe8a410a, 0xd7610002, + 0x00010000, 0xd7610002, + 0x00010201, 0xd7610002, + 0x00010402, 0xd7610002, + 0x00010603, 0xd7610002, + 0x00010804, 0xd7610002, + 0x00010a05, 0xd7610002, + 0x00010c06, 0xd7610002, + 0x00010e07, 0xd7610002, + 0x00011008, 0xd7610002, + 0x00011209, 0xd7610002, + 0x0001140a, 0xd7610002, + 0x0001160b, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xd8500000, + 0x00000000, 0xc4068070, 0x008ce802, 0x00000000, + 0xbefe00c1, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8fb4306, 0x8b7bc17b, + 0xbfa10044, 0x8b7aff6d, + 0x80000000, 0xbfa10041, + 0x847b897b, 0xbef6007b, + 0xb8f03b05, 0x80708170, + 0xbf0d9973, 0xbfa20002, + 0x84708970, 0xbfa00001, + 0x84708a70, 0xb8fa1e06, + 0x847a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, 0x8070ff70, 0x00000080, - 0xbef90080, 0x7e040280, - 0x807d907d, 0xbf0aff7d, - 0x00000060, 0xbfa2ffbb, - 0xbe804100, 0xbe824102, - 0xbe844104, 0xbe864106, - 0xbe884108, 0xbe8a410a, - 0xd7610002, 0x0000f200, - 0x80798179, 0xd7610002, - 0x0000f201, 0x80798179, - 0xd7610002, 0x0000f202, - 0x80798179, 0xd7610002, - 0x0000f203, 0x80798179, - 0xd7610002, 0x0000f204, - 0x80798179, 0xd7610002, - 0x0000f205, 0x80798179, - 0xd7610002, 0x0000f206, - 0x80798179, 0xd7610002, - 0x0000f207, 0x80798179, - 0xd7610002, 0x0000f208, - 0x80798179, 0xd7610002, - 0x0000f209, 0x80798179, - 0xd7610002, 0x0000f20a, - 0x80798179, 0xd7610002, - 0x0000f20b, 0x80798179, - 0xc4068070, 0x008ce802, - 0x00000000, 0xbefe00c1, - 0x857d9973, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8fb4306, - 0x8b7bc17b, 0xbfa10044, - 0x8b7aff6d, 0x80000000, - 0xbfa10041, 0x847b897b, - 0xbef6007b, 0xb8f03b05, - 0x80708170, 0xbf0d9973, - 0xbfa20002, 0x84708970, - 0xbfa00001, 0x84708a70, - 0xb8fa1e06, 0x847a8a7a, - 0x80707a70, 0x8070ff70, - 0x00000200, 0x8070ff70, - 0x00000080, 0xbef600ff, - 0x01000000, 0xd71f0000, - 0x000100c1, 0xd7200000, - 0x000200c1, 0x16000084, - 0x857d9973, 0x8b7d817d, - 0xbf06817d, 0xbefd0080, - 0xbfa20013, 0xbe8300ff, - 0x00000080, 0xbf800000, - 0xbf800000, 0xbf800000, - 0xd8d80000, 0x01000000, - 0xbf8a0000, 0xc4068070, - 0x008ce801, 0x00000000, - 0x807d037d, 0x80700370, - 0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a7b7d, - 0xbfa2fff3, 0xbfa00012, - 0xbe8300ff, 0x00000100, + 0xbef600ff, 0x01000000, + 0xd71f0000, 0x000100c1, + 0xd7200000, 0x000200c1, + 0x16000084, 0x857d9973, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa20013, + 0xbe8300ff, 0x00000080, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8a0000, 0xc4068070, 0x008ce801, 0x00000000, 0x807d037d, 0x80700370, 
0xd5250000, - 0x0001ff00, 0x00000100, + 0x0001ff00, 0x00000080, 0xbf0a7b7d, 0xbfa2fff3, - 0xbefe00c1, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa20004, 0xbef000ff, - 0x00000200, 0xbeff0080, - 0xbfa00003, 0xbef000ff, - 0x00000400, 0xbeff00c1, - 0xb8fb3b05, 0x807b817b, - 0x847b827b, 0x857d9973, - 0x8b7d817d, 0xbf06817d, - 0xbfa2001b, 0xbef600ff, - 0x01000000, 0xbefd0084, - 0xbf0a7b7d, 0xbfa10040, - 0x7e008700, 0x7e028701, - 0x7e048702, 0x7e068703, - 0xc4068070, 0x008ce800, - 0x00000000, 0xc4068070, - 0x008ce801, 0x00008000, - 0xc4068070, 0x008ce802, - 0x00010000, 0xc4068070, - 0x008ce803, 0x00018000, - 0x807d847d, 0x8070ff70, - 0x00000200, 0xbf0a7b7d, - 0xbfa2ffeb, 0xbfa0002a, + 0xbfa00012, 0xbe8300ff, + 0x00000100, 0xbf800000, + 0xbf800000, 0xbf800000, + 0xd8d80000, 0x01000000, + 0xbf8a0000, 0xc4068070, + 0x008ce801, 0x00000000, + 0x807d037d, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a7b7d, + 0xbfa2fff3, 0xbefe00c1, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa20004, + 0xbef000ff, 0x00000200, + 0xbeff0080, 0xbfa00003, + 0xbef000ff, 0x00000400, + 0xbeff00c1, 0xb8fb3b05, + 0x807b817b, 0x847b827b, + 0x857d9973, 0x8b7d817d, + 0xbf06817d, 0xbfa2001b, 0xbef600ff, 0x01000000, 0xbefd0084, 0xbf0a7b7d, - 0xbfa10015, 0x7e008700, + 0xbfa10040, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xc4068070, 0x008ce800, 0x00000000, 0xc4068070, 0x008ce801, - 0x00010000, 0xc4068070, - 0x008ce802, 0x00020000, + 0x00008000, 0xc4068070, + 0x008ce802, 0x00010000, 0xc4068070, 0x008ce803, - 0x00030000, 0x807d847d, - 0x8070ff70, 0x00000400, + 0x00018000, 0x807d847d, + 0x8070ff70, 0x00000200, 0xbf0a7b7d, 0xbfa2ffeb, - 0xb8fb1e06, 0x8b7bc17b, - 0xbfa1000d, 0x847b837b, - 0x807b7d7b, 0xbefe00c1, - 0xbeff0080, 0x7e008700, + 0xbfa0002a, 0xbef600ff, + 0x01000000, 0xbefd0084, + 0xbf0a7b7d, 0xbfa10015, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, 0xc4068070, 0x008ce800, - 0x00000000, 0x807d817d, - 0x8070ff70, 0x00000080, - 0xbf0a7b7d, 0xbfa2fff7, - 0xbfa0016e, 0xbef4007e, - 0x8b75ff7f, 0x0000ffff, - 0x8c75ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x10807fac, 0xbef1007f, - 0xb8f20742, 0x84729972, - 0x8b6eff7f, 0x04000000, - 0xbfa1003b, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef4306, - 0x8b6fc16f, 0xbfa10030, - 0x846f896f, 0xbef6006f, + 0x00000000, 0xc4068070, + 0x008ce801, 0x00010000, + 0xc4068070, 0x008ce802, + 0x00020000, 0xc4068070, + 0x008ce803, 0x00030000, + 0x807d847d, 0x8070ff70, + 0x00000400, 0xbf0a7b7d, + 0xbfa2ffeb, 0xb8fb1e06, + 0x8b7bc17b, 0xbfa1000d, + 0x847b837b, 0x807b7d7b, + 0xbefe00c1, 0xbeff0080, + 0x7e008700, 0xc4068070, + 0x008ce800, 0x00000000, + 0x807d817d, 0x8070ff70, + 0x00000080, 0xbf0a7b7d, + 0xbfa2fff7, 0xbfa0016e, + 0xbef4007e, 0x8b75ff7f, + 0x0000ffff, 0x8c75ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x10807fac, + 0xbef1007f, 0xb8f20742, + 0x84729972, 0x8b6eff7f, + 0x04000000, 0xbfa1003b, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef4306, 0x8b6fc16f, + 0xbfa10030, 0x846f896f, + 0xbef6006f, 0xb8f83b05, + 0x80788178, 0xbf0d9972, + 0xbfa20002, 0x84788978, + 0xbfa00001, 0x84788a78, + 0xb8ee1e06, 0x846e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef600ff, + 0x01000000, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbefd0080, 0xbfa2000d, + 0xc4050078, 0x0080e800, + 0x00000000, 0xbf8a0000, + 0xdac00000, 0x00000000, + 0x807dff7d, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff4, + 0xbfa0000c, 0xc4050078, 
+ 0x0080e800, 0x00000000, + 0xbf8a0000, 0xdac00000, + 0x00000000, 0x807dff7d, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7d, + 0xbfa2fff4, 0xbef80080, + 0xbefe00c1, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa20002, 0xbeff0080, + 0xbfa00001, 0xbeff00c1, + 0xb8ef3b05, 0x806f816f, + 0x846f826f, 0x857d9972, + 0x8b7d817d, 0xbf06817d, + 0xbfa2002c, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000200, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10061, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00008000, 0xc4050078, + 0x008ce802, 0x00010000, + 0xc4050078, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000200, 0xbf0a6f7d, + 0xbfa2ffea, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00008000, 0xc405006e, + 0x008ce802, 0x00010000, + 0xc405006e, 0x008ce803, + 0x00018000, 0xbf8a0000, + 0xbfa0003d, 0xbef600ff, + 0x01000000, 0xbeee0078, + 0x8078ff78, 0x00000400, + 0xbefd0084, 0xbf0a6f7d, + 0xbfa10016, 0xc4050078, + 0x008ce800, 0x00000000, + 0xc4050078, 0x008ce801, + 0x00010000, 0xc4050078, + 0x008ce802, 0x00020000, + 0xc4050078, 0x008ce803, + 0x00030000, 0xbf8a0000, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807d847d, 0x8078ff78, + 0x00000400, 0xbf0a6f7d, + 0xbfa2ffea, 0xb8ef1e06, + 0x8b6fc16f, 0xbfa1000f, + 0x846f836f, 0x806f7d6f, + 0xbefe00c1, 0xbeff0080, + 0xc4050078, 0x008ce800, + 0x00000000, 0xbf8a0000, + 0x7e008500, 0x807d817d, + 0x8078ff78, 0x00000080, + 0xbf0a6f7d, 0xbfa2fff6, + 0xbeff00c1, 0xc405006e, + 0x008ce800, 0x00000000, + 0xc405006e, 0x008ce801, + 0x00010000, 0xc405006e, + 0x008ce802, 0x00020000, + 0xc405006e, 0x008ce803, + 0x00030000, 0xbf8a0000, 0xb8f83b05, 0x80788178, 0xbf0d9972, 0xbfa20002, 0x84788978, 0xbfa00001, 0x84788a78, 0xb8ee1e06, 0x846e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0x8078ff78, 0x00000080, - 0xbef600ff, 0x01000000, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbefd0080, - 0xbfa2000d, 0xc4050078, - 0x0080e800, 0x00000000, - 0xbf8a0000, 0xdac00000, - 0x00000000, 0x807dff7d, - 0x00000080, 0x8078ff78, - 0x00000080, 0xbf0a6f7d, - 0xbfa2fff4, 0xbfa0000c, - 0xc4050078, 0x0080e800, - 0x00000000, 0xbf8a0000, - 0xdac00000, 0x00000000, - 0x807dff7d, 0x00000100, - 0x8078ff78, 0x00000100, - 0xbf0a6f7d, 0xbfa2fff4, - 0xbef80080, 0xbefe00c1, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa20002, - 0xbeff0080, 0xbfa00001, - 0xbeff00c1, 0xb8ef3b05, - 0x806f816f, 0x846f826f, - 0x857d9972, 0x8b7d817d, - 0xbf06817d, 0xbfa2002c, + 0x80f8ff78, 0x00000050, 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000200, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10061, - 0xc4050078, 0x008ce800, - 0x00000000, 0xc4050078, - 0x008ce801, 0x00008000, - 0xc4050078, 0x008ce802, - 0x00010000, 0xc4050078, - 0x008ce803, 0x00018000, - 0xbf8a0000, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, + 0xbefd00ff, 0x0000006c, + 0x80f89078, 0xf462403a, + 0xf0000000, 0xbf8a0000, + 0x80fd847d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0x80f8a078, 0xf462603a, + 0xf0000000, 0xbf8a0000, + 0x80fd887d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0x80f8c078, 0xf462803a, + 0xf0000000, 0xbf8a0000, + 0x80fd907d, 0xbf800000, + 0xbe804300, 0xbe824302, + 0xbe844304, 0xbe864306, + 0xbe884308, 0xbe8a430a, + 0xbe8c430c, 0xbe8e430e, + 0xbf06807d, 0xbfa1fff0, + 0xb980f801, 0x00000000, + 0xb8f83b05, 0x80788178, + 0xbf0d9972, 0xbfa20002, + 0x84788978, 0xbfa00001, + 0x84788a78, 0xb8ee1e06, + 0x846e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, - 0xbf0a6f7d, 0xbfa2ffea, - 
0xc405006e, 0x008ce800, - 0x00000000, 0xc405006e, - 0x008ce801, 0x00008000, - 0xc405006e, 0x008ce802, - 0x00010000, 0xc405006e, - 0x008ce803, 0x00018000, - 0xbf8a0000, 0xbfa0003d, 0xbef600ff, 0x01000000, - 0xbeee0078, 0x8078ff78, - 0x00000400, 0xbefd0084, - 0xbf0a6f7d, 0xbfa10016, - 0xc4050078, 0x008ce800, - 0x00000000, 0xc4050078, - 0x008ce801, 0x00010000, - 0xc4050078, 0x008ce802, - 0x00020000, 0xc4050078, - 0x008ce803, 0x00030000, - 0xbf8a0000, 0x7e008500, - 0x7e028501, 0x7e048502, - 0x7e068503, 0x807d847d, - 0x8078ff78, 0x00000400, - 0xbf0a6f7d, 0xbfa2ffea, - 0xb8ef1e06, 0x8b6fc16f, - 0xbfa1000f, 0x846f836f, - 0x806f7d6f, 0xbefe00c1, - 0xbeff0080, 0xc4050078, - 0x008ce800, 0x00000000, - 0xbf8a0000, 0x7e008500, - 0x807d817d, 0x8078ff78, - 0x00000080, 0xbf0a6f7d, - 0xbfa2fff6, 0xbeff00c1, - 0xc405006e, 0x008ce800, - 0x00000000, 0xc405006e, - 0x008ce801, 0x00010000, - 0xc405006e, 0x008ce802, - 0x00020000, 0xc405006e, - 0x008ce803, 0x00030000, - 0xbf8a0000, 0xb8f83b05, - 0x80788178, 0xbf0d9972, - 0xbfa20002, 0x84788978, - 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0x80f8ff78, - 0x00000050, 0xbef600ff, - 0x01000000, 0xbefd00ff, - 0x0000006c, 0x80f89078, - 0xf462403a, 0xf0000000, - 0xbf8a0000, 0x80fd847d, - 0xbf800000, 0xbe804300, - 0xbe824302, 0x80f8a078, - 0xf462603a, 0xf0000000, - 0xbf8a0000, 0x80fd887d, - 0xbf800000, 0xbe804300, - 0xbe824302, 0xbe844304, - 0xbe864306, 0x80f8c078, - 0xf462803a, 0xf0000000, - 0xbf8a0000, 0x80fd907d, - 0xbf800000, 0xbe804300, - 0xbe824302, 0xbe844304, - 0xbe864306, 0xbe884308, - 0xbe8a430a, 0xbe8c430c, - 0xbe8e430e, 0xbf06807d, - 0xbfa1fff0, 0xb980f801, - 0x00000000, 0xb8f83b05, - 0x80788178, 0xbf0d9972, - 0xbfa20002, 0x84788978, - 0xbfa00001, 0x84788a78, - 0xb8ee1e06, 0x846e8a6e, - 0x80786e78, 0x8078ff78, - 0x00000200, 0xbef600ff, - 0x01000000, 0xbeff0071, - 0xf4621bfa, 0xf0000000, - 0x80788478, 0xf4621b3a, + 0xbeff0071, 0xf4621bfa, 0xf0000000, 0x80788478, - 0xf4621b7a, 0xf0000000, - 0x80788478, 0xf4621c3a, + 0xf4621b3a, 0xf0000000, + 0x80788478, 0xf4621b7a, 0xf0000000, 0x80788478, - 0xf4621c7a, 0xf0000000, - 0x80788478, 0xf4621eba, + 0xf4621c3a, 0xf0000000, + 0x80788478, 0xf4621c7a, 0xf0000000, 0x80788478, - 0xf4621efa, 0xf0000000, - 0x80788478, 0xf4621e7a, + 0xf4621eba, 0xf0000000, + 0x80788478, 0xf4621efa, 0xf0000000, 0x80788478, - 0xf4621cfa, 0xf0000000, - 0x80788478, 0xf4621bba, + 0xf4621e7a, 0xf0000000, + 0x80788478, 0xf4621cfa, 0xf0000000, 0x80788478, - 0xbf8a0000, 0xb96ef814, 0xf4621bba, 0xf0000000, 0x80788478, 0xbf8a0000, - 0xb96ef815, 0xf4621bba, + 0xb96ef814, 0xf4621bba, 0xf0000000, 0x80788478, - 0xbf8a0000, 0xb96ef812, + 0xbf8a0000, 0xb96ef815, 0xf4621bba, 0xf0000000, 0x80788478, 0xbf8a0000, - 0xb96ef813, 0x8b6eff7f, - 0x04000000, 0xbfa1000d, - 0x80788478, 0xf4621bba, + 0xb96ef812, 0xf4621bba, 0xf0000000, 0x80788478, - 0xbf8a0000, 0xbf0d806e, - 0xbfa10006, 0x856e906e, - 0x8b6e6e6e, 0xbfa10003, - 0xbe804ec1, 0x816ec16e, - 0xbfa0fffb, 0xbefd006f, - 0xbefe0070, 0xbeff0071, - 0xb97b2011, 0x857b867b, - 0xb97b0191, 0x857b827b, - 0xb97bba11, 0xb973f801, - 0xb8ee3b05, 0x806e816e, - 0xbf0d9972, 0xbfa20002, - 0x846e896e, 0xbfa00001, - 0x846e8a6e, 0xb8ef1e06, - 0x846f8a6f, 0x806e6f6e, - 0x806eff6e, 0x00000200, - 0x806e746e, 0x826f8075, - 0x8b6fff6f, 0x0000ffff, - 0xf4605c37, 0xf8000050, - 0xf4605d37, 0xf8000060, - 0xf4601e77, 0xf8000074, - 0xbf8a0000, 0x8b6dff6d, - 0x0000ffff, 0x8bfe7e7e, - 0x8bea6a6a, 0xb97af804, + 0xbf8a0000, 0xb96ef813, + 0x8b6eff7f, 0x04000000, + 0xbfa1000d, 0x80788478, + 0xf4621bba, 
0xf0000000, + 0x80788478, 0xbf8a0000, + 0xbf0d806e, 0xbfa10006, + 0x856e906e, 0x8b6e6e6e, + 0xbfa10003, 0xbe804ec1, + 0x816ec16e, 0xbfa0fffb, + 0xbefd006f, 0xbefe0070, + 0xbeff0071, 0xb97b2011, + 0x857b867b, 0xb97b0191, + 0x857b827b, 0xb97bba11, + 0xb973f801, 0xb8ee3b05, + 0x806e816e, 0xbf0d9972, + 0xbfa20002, 0x846e896e, + 0xbfa00001, 0x846e8a6e, + 0xb8ef1e06, 0x846f8a6f, + 0x806e6f6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x8b6fff6f, + 0x0000ffff, 0xf4605c37, + 0xf8000050, 0xf4605d37, + 0xf8000060, 0xf4601e77, + 0xf8000074, 0xbf8a0000, + 0x8b6dff6d, 0x0000ffff, + 0x8bfe7e7e, 0x8bea6a6a, + 0xb97af804, 0xbe804ec2, + 0xbf94fffe, 0xbe804a6c, 0xbe804ec2, 0xbf94fffe, - 0xbe804a6c, 0xbe804ec2, - 0xbf94fffe, 0xbfb10000, + 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index 7b9d36e5fa43..5a1a1b1f897f 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -30,6 +30,7 @@ #define CHIP_GFX12 37 #define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised +#define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12) var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 @@ -351,6 +352,7 @@ L_HAVE_VGPRS: v_writelane_b32 v0, ttmp13, 0xD v_writelane_b32 v0, exec_lo, 0xE v_writelane_b32 v0, exec_hi, 0xF + valu_sgpr_hazard() s_mov_b32 exec_lo, 0x3FFF s_mov_b32 exec_hi, 0x0 @@ -417,7 +419,6 @@ L_SAVE_HWREG: v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store - s_mov_b32 m0, 0x0 //Next lane of v2 to write to // Ensure no further changes to barrier or LDS state. // STATE_PRIV.BARRIER_COMPLETE may change up to this point. 
@@ -430,40 +431,41 @@ L_SAVE_HWREG: s_andn2_b32 s_save_state_priv, s_save_state_priv, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK s_or_b32 s_save_state_priv, s_save_state_priv, s_save_tmp - write_hwreg_to_v2(s_save_m0) - write_hwreg_to_v2(s_save_pc_lo) s_andn2_b32 s_save_tmp, s_save_pc_hi, S_SAVE_PC_HI_FIRST_WAVE_MASK - write_hwreg_to_v2(s_save_tmp) - write_hwreg_to_v2(s_save_exec_lo) - write_hwreg_to_v2(s_save_exec_hi) - write_hwreg_to_v2(s_save_state_priv) + v_writelane_b32 v2, s_save_m0, 0x0 + v_writelane_b32 v2, s_save_pc_lo, 0x1 + v_writelane_b32 v2, s_save_tmp, 0x2 + v_writelane_b32 v2, s_save_exec_lo, 0x3 + v_writelane_b32 v2, s_save_exec_hi, 0x4 + v_writelane_b32 v2, s_save_state_priv, 0x5 + v_writelane_b32 v2, s_save_xnack_mask, 0x7 + valu_sgpr_hazard() s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) - write_hwreg_to_v2(s_save_tmp) + v_writelane_b32 v2, s_save_tmp, 0x6 - write_hwreg_to_v2(s_save_xnack_mask) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_MODE) + v_writelane_b32 v2, s_save_tmp, 0x8 - s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_MODE) - write_hwreg_to_v2(s_save_m0) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) + v_writelane_b32 v2, s_save_tmp, 0x9 - s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) - write_hwreg_to_v2(s_save_m0) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) + v_writelane_b32 v2, s_save_tmp, 0xA - s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) - write_hwreg_to_v2(s_save_m0) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) + v_writelane_b32 v2, s_save_tmp, 0xB - s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) - write_hwreg_to_v2(s_save_m0) - - s_getreg_b32 s_save_m0, hwreg(HW_REG_WAVE_TRAP_CTRL) - write_hwreg_to_v2(s_save_m0) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_TRAP_CTRL) + v_writelane_b32 v2, s_save_tmp, 0xC s_getreg_b32 s_save_tmp, hwreg(HW_REG_WAVE_STATUS) - write_hwreg_to_v2(s_save_tmp) + v_writelane_b32 v2, s_save_tmp, 0xD s_get_barrier_state s_save_tmp, -1 s_wait_kmcnt (0) - write_hwreg_to_v2(s_save_tmp) + v_writelane_b32 v2, s_save_tmp, 0xE + valu_sgpr_hazard() // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this. s_mov_b32 exec_lo, 0xFFFF @@ -497,10 +499,12 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] - write_16sgpr_to_v2(s0) - - s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? - s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE + s_cmp_eq_u32 ttmp13, 0x0 + s_cbranch_scc0 L_WRITE_V2_SECOND_HALF + write_16sgpr_to_v2(s0, 0x0) + s_branch L_SAVE_SGPR_SKIP_TCP_STORE +L_WRITE_V2_SECOND_HALF: + write_16sgpr_to_v2(s0, 0x10) buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset scope:SCOPE_SYS s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 @@ -1056,27 +1060,21 @@ L_END_PGM: s_endpgm_saved end -function write_hwreg_to_v2(s) - // Copy into VGPR for later TCP store. - v_writelane_b32 v2, s, m0 - s_add_u32 m0, m0, 0x1 -end - - -function write_16sgpr_to_v2(s) +function write_16sgpr_to_v2(s, lane_offset) // Copy into VGPR for later TCP store. for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ - v_writelane_b32 v2, s[sgpr_idx], ttmp13 - s_add_u32 ttmp13, ttmp13, 0x1 + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx + lane_offset end + valu_sgpr_hazard() + s_add_u32 ttmp13, ttmp13, 0x10 end function write_12sgpr_to_v2(s) // Copy into VGPR for later TCP store. 
for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ - v_writelane_b32 v2, s[sgpr_idx], ttmp13 - s_add_u32 ttmp13, ttmp13, 0x1 + v_writelane_b32 v2, s[sgpr_idx], sgpr_idx end + valu_sgpr_hazard() end function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) @@ -1128,3 +1126,11 @@ function get_wave_size2(s_reg) s_getreg_b32 s_reg, hwreg(HW_REG_WAVE_STATUS,SQ_WAVE_STATUS_WAVE64_SHIFT,SQ_WAVE_STATUS_WAVE64_SIZE) s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE end + +function valu_sgpr_hazard +#if HAVE_VALU_SGPR_HAZARD + for var rep = 0; rep < 8; rep ++ + ds_nop + end +#endif +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 24396a2c77bd..4afff7094caf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -233,6 +233,7 @@ void kfd_queue_buffer_put(struct amdgpu_bo **bo) int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { struct kfd_topology_device *topo_dev; + u64 expected_queue_size; struct amdgpu_vm *vm; u32 total_cwsr_size; int err; @@ -241,6 +242,15 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope if (!topo_dev) return -EINVAL; + /* AQL queues on GFX7 and GFX8 appear twice their actual size */ + if (properties->type == KFD_QUEUE_TYPE_COMPUTE && + properties->format == KFD_QUEUE_FORMAT_AQL && + topo_dev->node_props.gfx_target_version >= 70000 && + topo_dev->node_props.gfx_target_version < 90000) + expected_queue_size = properties->queue_size / 2; + else + expected_queue_size = properties->queue_size; + vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); if (err) @@ -255,7 +265,7 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; err = kfd_queue_buffer_get(vm, (void *)properties->queue_address, - &properties->ring_bo, properties->queue_size); + &properties->ring_bo, expected_queue_size); if (err) goto out_err_unreserve; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index bd3e20d981e0..9477a4adcd36 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1286,13 +1286,7 @@ svm_range_get_pte_flags(struct kfd_node *node, break; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): - if (domain == SVM_RANGE_VRAM_DOMAIN) { - if (bo_node != node) - mapping_flags |= AMDGPU_VM_MTYPE_NC; - } else { - mapping_flags |= coherent ? - AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; - } + mapping_flags |= AMDGPU_VM_MTYPE_NC; break; default: mapping_flags |= coherent ? 
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 74ad0d1240fe..39df45f652b3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1745,7 +1745,7 @@ static void retrieve_dmi_info(struct amdgpu_display_manager *dm, struct dc_init_ } if (quirk_entries.support_edp0_on_dp1) { init_data->flags.support_edp0_on_dp1 = true; - drm_info(dev, "aux_hpd_discon_quirk attached\n"); + drm_info(dev, "support_edp0_on_dp1 attached\n"); } } diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index bf636b28e3e1..6e2fce329d73 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -69,5 +69,16 @@ bool should_use_dmub_lock(struct dc_link *link) if (link->replay_settings.replay_feature_enabled) return true; + /* only use HW lock for PSR1 on single eDP */ + if (link->psr_settings.psr_version == DC_PSR_VERSION_1) { + struct dc_link *edp_links[MAX_NUM_EDP]; + int edp_num; + + dc_get_edp_links(link->dc, edp_links, &edp_num); + + if (edp_num == 1) + return true; + } + return false; } diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 15ea216e903d..6157886f4802 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -704,7 +704,7 @@ enum dmub_status dmub_srv_hw_init(struct dmub_srv *dmub, cw6.region.base = DMUB_CW6_BASE; cw6.region.top = cw6.region.base + fw_state_fb->size; - dmub->fw_state = fw_state_fb->cpu_addr; + dmub->fw_state = (void *)((uintptr_t)(fw_state_fb->cpu_addr) + DMUB_DEBUG_FW_STATE_OFFSET); region6.offset.quad_part = shared_state_fb->gpu_addr; region6.region.base = DMUB_CW6_BASE; diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index e8ae7681bf0a..77b1f061bbf0 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -2421,6 +2421,8 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): *states = ATTR_STATE_SUPPORTED; break; default: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 5cad09c5f2ff..3f1fcf8c4ee8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1193,16 +1193,9 @@ static int smu_v14_0_2_print_clk_levels(struct smu_context *smu, PP_OD_FEATURE_GFXCLK_BIT)) break; - PPTable_t *pptable = smu->smu_table.driver_pptable; - const OverDriveLimits_t * const overdrive_upperlimits = - &pptable->SkuTable.OverDriveLimitsBasicMax; - const OverDriveLimits_t * const overdrive_lowerlimits = - &pptable->SkuTable.OverDriveLimitsBasicMin; - size += sysfs_emit_at(buf, size, "OD_SCLK_OFFSET:\n"); - size += sysfs_emit_at(buf, size, "0: %dMhz\n1: %uMhz\n", - overdrive_lowerlimits->GfxclkFoffset, - overdrive_upperlimits->GfxclkFoffset); + size += sysfs_emit_at(buf, size, "%dMhz\n", + od_table->OverDriveTable.GfxclkFoffset); break; case SMU_OD_MCLK: @@ -1337,12 +1330,8 @@ static int smu_v14_0_2_print_clk_levels(struct smu_context *smu, if (smu_v14_0_2_is_od_feature_supported(smu, PP_OD_FEATURE_GFXCLK_BIT)) { 
smu_v14_0_2_get_od_setting_limits(smu, - PP_OD_FEATURE_GFXCLK_FMIN, - &min_value, - NULL); - smu_v14_0_2_get_od_setting_limits(smu, PP_OD_FEATURE_GFXCLK_FMAX, - NULL, + &min_value, &max_value); size += sysfs_emit_at(buf, size, "SCLK_OFFSET: %7dMhz %10uMhz\n", min_value, max_value); @@ -1627,6 +1616,39 @@ out: adev->unique_id = ((uint64_t)upper32 << 32) | lower32; } +static int smu_v14_0_2_get_fan_speed_pwm(struct smu_context *smu, + uint32_t *speed) +{ + int ret; + + if (!speed) + return -EINVAL; + + ret = smu_v14_0_2_get_smu_metrics_data(smu, + METRICS_CURR_FANPWM, + speed); + if (ret) { + dev_err(smu->adev->dev, "Failed to get fan speed(PWM)!"); + return ret; + } + + /* Convert the PMFW output which is in percent to pwm(255) based */ + *speed = min(*speed * 255 / 100, (uint32_t)255); + + return 0; +} + +static int smu_v14_0_2_get_fan_speed_rpm(struct smu_context *smu, + uint32_t *speed) +{ + if (!speed) + return -EINVAL; + + return smu_v14_0_2_get_smu_metrics_data(smu, + METRICS_CURR_FANSPEED, + speed); +} + static int smu_v14_0_2_get_power_limit(struct smu_context *smu, uint32_t *current_power_limit, uint32_t *default_power_limit, @@ -2417,36 +2439,24 @@ static int smu_v14_0_2_od_edit_dpm_table(struct smu_context *smu, return -ENOTSUPP; } - for (i = 0; i < size; i += 2) { - if (i + 2 > size) { - dev_info(adev->dev, "invalid number of input parameters %d\n", size); - return -EINVAL; - } - - switch (input[i]) { - case 1: - smu_v14_0_2_get_od_setting_limits(smu, - PP_OD_FEATURE_GFXCLK_FMAX, - &minimum, - &maximum); - if (input[i + 1] < minimum || - input[i + 1] > maximum) { - dev_info(adev->dev, "GfxclkFmax (%ld) must be within [%u, %u]!\n", - input[i + 1], minimum, maximum); - return -EINVAL; - } - - od_table->OverDriveTable.GfxclkFoffset = input[i + 1]; - od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFXCLK_BIT; - break; + if (size != 1) { + dev_info(adev->dev, "invalid number of input parameters %d\n", size); + return -EINVAL; + } - default: - dev_info(adev->dev, "Invalid SCLK_VDDC_TABLE index: %ld\n", input[i]); - dev_info(adev->dev, "Supported indices: [0:min,1:max]\n"); - return -EINVAL; - } + smu_v14_0_2_get_od_setting_limits(smu, + PP_OD_FEATURE_GFXCLK_FMAX, + &minimum, + &maximum); + if (input[0] < minimum || + input[0] > maximum) { + dev_info(adev->dev, "GfxclkFoffset must be within [%d, %u]!\n", + minimum, maximum); + return -EINVAL; } + od_table->OverDriveTable.GfxclkFoffset = input[0]; + od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFXCLK_BIT; break; case PP_OD_EDIT_MCLK_VDDC_TABLE: @@ -2804,6 +2814,8 @@ static const struct pptable_funcs smu_v14_0_2_ppt_funcs = { .set_performance_level = smu_v14_0_set_performance_level, .gfx_off_control = smu_v14_0_gfx_off_control, .get_unique_id = smu_v14_0_2_get_unique_id, + .get_fan_speed_pwm = smu_v14_0_2_get_fan_speed_pwm, + .get_fan_speed_rpm = smu_v14_0_2_get_fan_speed_rpm, .get_power_limit = smu_v14_0_2_get_power_limit, .set_power_limit = smu_v14_0_2_set_power_limit, .get_power_profile_mode = smu_v14_0_2_get_power_profile_mode, diff --git a/drivers/gpu/drm/i915/gt/uc/intel_huc.c b/drivers/gpu/drm/i915/gt/uc/intel_huc.c index b3cbf85c00cb..00d00c480cc5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_huc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_huc.c @@ -231,8 +231,8 @@ static void delayed_huc_load_init(struct intel_huc *huc) sw_fence_dummy_notify); i915_sw_fence_commit(&huc->delayed_load.fence); - hrtimer_init(&huc->delayed_load.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - 
huc->delayed_load.timer.function = huc_delayed_load_timer_callback; + hrtimer_setup(&huc->delayed_load.timer, huc_delayed_load_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static void delayed_huc_load_fini(struct intel_huc *huc) diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c index 95570cabdf27..f668cd9487f1 100644 --- a/drivers/gpu/drm/i915/gvt/display.c +++ b/drivers/gpu/drm/i915/gvt/display.c @@ -581,8 +581,7 @@ static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num, vgpu->display.port_num = port_num; /* Init hrtimer based on default refresh rate */ - hrtimer_init(&vblank_timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - vblank_timer->timer.function = vblank_timer_fn; + hrtimer_setup(&vblank_timer->timer, vblank_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); vblank_timer->vrefresh_k = port->vrefresh_k; vblank_timer->period = DIV64_U64_ROUND_CLOSEST(NSEC_PER_SEC * MSEC_PER_SEC, vblank_timer->vrefresh_k); diff --git a/drivers/gpu/drm/i915/gvt/sched_policy.c b/drivers/gpu/drm/i915/gvt/sched_policy.c index c077fb4674f0..9f97f743aa71 100644 --- a/drivers/gpu/drm/i915/gvt/sched_policy.c +++ b/drivers/gpu/drm/i915/gvt/sched_policy.c @@ -286,8 +286,7 @@ static int tbs_sched_init(struct intel_gvt *gvt) return -ENOMEM; INIT_LIST_HEAD(&data->lru_runq_head); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - data->timer.function = tbs_timer_fn; + hrtimer_setup(&data->timer, tbs_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); data->period = GVT_DEFAULT_TIME_SLICE; data->gvt = gvt; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 5384d1bb4923..279e266b4b06 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3359,9 +3359,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, "opening stream oa config uuid=%s\n", stream->oa_config->uuid); - hrtimer_init(&stream->poll_check_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - stream->poll_check_timer.function = oa_poll_check_timer_cb; + hrtimer_setup(&stream->poll_check_timer, oa_poll_check_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); mutex_init(&stream->lock); diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index e55db036be1b..0ce87f188d11 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -1264,8 +1264,7 @@ void i915_pmu_register(struct drm_i915_private *i915) int ret = -ENOMEM; spin_lock_init(&pmu->lock); - hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pmu->timer.function = i915_sample; + hrtimer_setup(&pmu->timer, i915_sample, CLOCK_MONOTONIC, HRTIMER_MODE_REL); pmu->cpuhp.cpu = -1; init_rc6(pmu); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 8f62cfa23fb7..ea0b8e7e4828 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -293,8 +293,7 @@ static void __rq_init_watchdog(struct i915_request *rq) { struct i915_request_watchdog *wdg = &rq->watchdog; - hrtimer_init(&wdg->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wdg->timer.function = __rq_watchdog_expired; + hrtimer_setup(&wdg->timer, __rq_watchdog_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } static void __rq_arm_watchdog(struct i915_request *rq) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index eed4937c3ff3..bdcfcae83b52 100644 --- 
a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -2103,8 +2103,7 @@ static int __fw_domain_init(struct intel_uncore *uncore, d->mask = BIT(domain_id); - hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - d->timer.function = intel_uncore_fw_release_timer; + hrtimer_setup(&d->timer, intel_uncore_fw_release_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); uncore->fw_domains |= BIT(domain_id); diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c index 1a5d4f1c8b42..d41e5a6bbee0 100644 --- a/drivers/gpu/drm/msm/msm_fence.c +++ b/drivers/gpu/drm/msm/msm_fence.c @@ -65,8 +65,7 @@ msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr, fctx->completed_fence = fctx->last_fence; *fctx->fenceptr = fctx->last_fence; - hrtimer_init(&fctx->deadline_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - fctx->deadline_timer.function = deadline_timer; + hrtimer_setup(&fctx->deadline_timer, deadline_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); kthread_init_work(&fctx->deadline_work, deadline_work); diff --git a/drivers/gpu/drm/msm/msm_io_utils.c b/drivers/gpu/drm/msm/msm_io_utils.c index afedd61c3e28..a6efe1eac271 100644 --- a/drivers/gpu/drm/msm/msm_io_utils.c +++ b/drivers/gpu/drm/msm/msm_io_utils.c @@ -135,8 +135,7 @@ void msm_hrtimer_work_init(struct msm_hrtimer_work *work, clockid_t clock_id, enum hrtimer_mode mode) { - hrtimer_init(&work->timer, clock_id, mode); - work->timer.function = msm_hrtimer_worktimer; + hrtimer_setup(&work->timer, msm_hrtimer_worktimer, clock_id, mode); work->worker = worker; kthread_init_work(&work->work, fn); } diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c index d1871af967d4..2355a78e1b69 100644 --- a/drivers/gpu/drm/radeon/radeon_vce.c +++ b/drivers/gpu/drm/radeon/radeon_vce.c @@ -557,7 +557,7 @@ int radeon_vce_cs_parse(struct radeon_cs_parser *p) { int session_idx = -1; bool destroyed = false, created = false, allocated = false; - uint32_t tmp, handle = 0; + uint32_t tmp = 0, handle = 0; uint32_t *size = &tmp; int i, r = 0; diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 69bcf0e99d57..da00572d7d42 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -259,9 +259,16 @@ static void drm_sched_entity_kill(struct drm_sched_entity *entity) struct drm_sched_fence *s_fence = job->s_fence; dma_fence_get(&s_fence->finished); - if (!prev || dma_fence_add_callback(prev, &job->finish_cb, - drm_sched_entity_kill_jobs_cb)) + if (!prev || + dma_fence_add_callback(prev, &job->finish_cb, + drm_sched_entity_kill_jobs_cb)) { + /* + * Adding callback above failed. + * dma_fence_put() checks for NULL. + */ + dma_fence_put(prev); drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + } prev = &s_fence->finished; } diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index da08ddb01d21..05608c894ed9 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -226,8 +226,12 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; unsigned long irqflags; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + spin_lock_irqsave(&v3d->job_lock, irqflags); + v3d->bin_job = NULL; + spin_unlock_irqrestore(&v3d->job_lock, irqflags); return NULL; + } /* Lock required around bin_job update vs * v3d_overflow_mem_work(). 
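The hrtimer conversions in the i915, GVT and MSM hunks above (and in several later drivers in this series) are all the same mechanical change: the two-step hrtimer_init() followed by a manual .function assignment is collapsed into a single hrtimer_setup() call. A minimal sketch of the pattern, with a hypothetical device structure and callback name — only the hrtimer_setup() call shape itself is taken from these hunks:

#include <linux/hrtimer.h>

/* Hypothetical driver state; only the hrtimer member matters here. */
struct example_dev {
	struct hrtimer timer;
};

static enum hrtimer_restart example_timer_cb(struct hrtimer *t)
{
	/* hypothetical handler body */
	return HRTIMER_NORESTART;
}

static void example_timer_init(struct example_dev *dev)
{
	/*
	 * Was:
	 *   hrtimer_init(&dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   dev->timer.function = example_timer_cb;
	 */
	hrtimer_setup(&dev->timer, example_timer_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}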
@@ -281,8 +285,10 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job) struct drm_device *dev = &v3d->drm; struct dma_fence *fence; - if (unlikely(job->base.base.s_fence->finished.error)) + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->render_job = NULL; return NULL; + } v3d->render_job = job; @@ -327,11 +333,17 @@ v3d_tfu_job_run(struct drm_sched_job *sched_job) struct drm_device *dev = &v3d->drm; struct dma_fence *fence; + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->tfu_job = NULL; + return NULL; + } + + v3d->tfu_job = job; + fence = v3d_fence_create(v3d, V3D_TFU); if (IS_ERR(fence)) return NULL; - v3d->tfu_job = job; if (job->base.irq_fence) dma_fence_put(job->base.irq_fence); job->base.irq_fence = dma_fence_get(fence); @@ -369,6 +381,11 @@ v3d_csd_job_run(struct drm_sched_job *sched_job) struct dma_fence *fence; int i, csd_cfg0_reg; + if (unlikely(job->base.base.s_fence->finished.error)) { + v3d->csd_job = NULL; + return NULL; + } + v3d->csd_job = job; v3d_invalidate_caches(v3d); diff --git a/drivers/gpu/drm/vkms/vkms_crtc.c b/drivers/gpu/drm/vkms/vkms_crtc.c index 28a57ae109fc..ae4e36bc337c 100644 --- a/drivers/gpu/drm/vkms/vkms_crtc.c +++ b/drivers/gpu/drm/vkms/vkms_crtc.c @@ -64,8 +64,8 @@ static int vkms_enable_vblank(struct drm_crtc *crtc) struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct vkms_output *out = drm_crtc_to_vkms_output(crtc); - hrtimer_init(&out->vblank_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - out->vblank_hrtimer.function = &vkms_vblank_simulate; + hrtimer_setup(&out->vblank_hrtimer, &vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); out->period_ns = ktime_set(0, vblank->framedur_ns); hrtimer_start(&out->vblank_hrtimer, out->period_ns, HRTIMER_MODE_REL); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c index 8651b788e98b..aec774fa4d7b 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_vkms.c @@ -290,8 +290,8 @@ vmw_vkms_enable_vblank(struct drm_crtc *crtc) drm_calc_timestamping_constants(crtc, &crtc->mode); - hrtimer_init(&du->vkms.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - du->vkms.timer.function = &vmw_vkms_vblank_simulate; + hrtimer_setup(&du->vkms.timer, &vmw_vkms_vblank_simulate, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); du->vkms.period_ns = ktime_set(0, vblank->framedur_ns); hrtimer_start(&du->vkms.timer, du->vkms.period_ns, HRTIMER_MODE_REL); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index d9386ab03140..43bf6f140d40 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -341,7 +341,6 @@ static inline unsigned int xe_sg_segment_size(struct device *dev) return round_down(max / 2, PAGE_SIZE); } -#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) /** * xe_bo_is_mem_type - Whether the bo currently resides in the given * TTM memory type @@ -356,4 +355,3 @@ static inline bool xe_bo_is_mem_type(struct xe_bo *bo, u32 mem_type) return bo->ttm.resource->mem_type == mem_type; } #endif -#endif diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index c5b95470fa32..f67803e15a0e 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -58,7 +58,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach) * 1) Avoid pinning in a placement not accessible to some importers. * 2) Pinning in VRAM requires PIN accounting which is a to-do. 
*/ - if (xe_bo_is_pinned(bo) && bo->ttm.resource->placement != XE_PL_TT) { + if (xe_bo_is_pinned(bo) && !xe_bo_is_mem_type(bo, XE_PL_TT)) { drm_dbg(&xe->drm, "Can't migrate pinned bo for dma-buf pin.\n"); return -EINVAL; } diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index eb6cd91e1e22..1fa46a04425e 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1766,8 +1766,8 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, WRITE_ONCE(u->exclusive_stream, stream); - hrtimer_init(&stream->poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - stream->poll_check_timer.function = xe_oa_poll_check_timer_cb; + hrtimer_setup(&stream->poll_check_timer, xe_oa_poll_check_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 46cae925b095..1f93e5e276c0 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -361,6 +361,10 @@ static bool host1x_wants_iommu(struct host1x *host1x) return true; } +/* + * Returns ERR_PTR on failure, NULL if the translation is IDENTITY, otherwise a + * valid paging domain. + */ static struct iommu_domain *host1x_iommu_attach(struct host1x *host) { struct iommu_domain *domain = iommu_get_domain_for_dev(host->dev); @@ -385,6 +389,8 @@ static struct iommu_domain *host1x_iommu_attach(struct host1x *host) * Similarly, if host1x is already attached to an IOMMU (via the DMA * API), don't try to attach again. */ + if (domain && domain->type == IOMMU_DOMAIN_IDENTITY) + domain = NULL; if (!host1x_wants_iommu(host) || domain) return domain; diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c index fa3351351825..79bc67ffb998 100644 --- a/drivers/hwmon/nct6775-core.c +++ b/drivers/hwmon/nct6775-core.c @@ -273,8 +273,8 @@ static const s8 NCT6776_BEEP_BITS[NUM_BEEP_BITS] = { static const u16 NCT6776_REG_TOLERANCE_H[] = { 0x10c, 0x20c, 0x30c, 0x80c, 0x90c, 0xa0c, 0xb0c }; -static const u8 NCT6776_REG_PWM_MODE[] = { 0x04, 0, 0, 0, 0, 0 }; -static const u8 NCT6776_PWM_MODE_MASK[] = { 0x01, 0, 0, 0, 0, 0 }; +static const u8 NCT6776_REG_PWM_MODE[] = { 0x04, 0, 0, 0, 0, 0, 0 }; +static const u8 NCT6776_PWM_MODE_MASK[] = { 0x01, 0, 0, 0, 0, 0, 0 }; static const u16 NCT6776_REG_FAN_MIN[] = { 0x63a, 0x63c, 0x63e, 0x640, 0x642, 0x64a, 0x64c }; diff --git a/drivers/hwtracing/stm/heartbeat.c b/drivers/hwtracing/stm/heartbeat.c index e9496fe97baa..495eb1dc8ac5 100644 --- a/drivers/hwtracing/stm/heartbeat.c +++ b/drivers/hwtracing/stm/heartbeat.c @@ -81,10 +81,8 @@ static int stm_heartbeat_init(void) stm_heartbeat[i].data.type = STM_USER; stm_heartbeat[i].data.link = stm_heartbeat_link; stm_heartbeat[i].data.unlink = stm_heartbeat_unlink; - hrtimer_init(&stm_heartbeat[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - stm_heartbeat[i].hrtimer.function = - stm_heartbeat_hrtimer_handler; + hrtimer_setup(&stm_heartbeat[i].hrtimer, stm_heartbeat_hrtimer_handler, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ret = stm_source_register_device(NULL, &stm_heartbeat[i].data); if (ret) diff --git a/drivers/i2c/busses/i2c-amd-mp2-pci.c b/drivers/i2c/busses/i2c-amd-mp2-pci.c index 143165300949..ef7370d3dbea 100644 --- a/drivers/i2c/busses/i2c-amd-mp2-pci.c +++ b/drivers/i2c/busses/i2c-amd-mp2-pci.c @@ -327,13 +327,11 @@ static int amd_mp2_pci_init(struct amd_mp2_dev *privdata, amd_mp2_irq_isr, irq_flag, dev_name(&pci_dev->dev), privdata); if (rc) { pci_err(pci_dev, "Failure requesting irq 
%i: %d\n", privdata->dev_irq, rc); - goto free_irq_vectors; + goto err_dma_mask; } return rc; -free_irq_vectors: - free_irq(privdata->dev_irq, privdata); err_dma_mask: pci_clear_master(pci_dev); err_pci_enable: @@ -376,7 +374,6 @@ static void amd_mp2_pci_remove(struct pci_dev *pci_dev) pm_runtime_forbid(&pci_dev->dev); pm_runtime_get_noresume(&pci_dev->dev); - free_irq(privdata->dev_irq, privdata); pci_clear_master(pci_dev); amd_mp2_clear_reg(privdata); diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index ee0d25b498cb..9e5d454d8318 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1723,8 +1723,8 @@ static int i2c_imx_probe(struct platform_device *pdev) return -ENOMEM; spin_lock_init(&i2c_imx->slave_lock); - hrtimer_init(&i2c_imx->slave_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - i2c_imx->slave_timer.function = i2c_imx_slave_timeout; + hrtimer_setup(&i2c_imx->slave_timer, i2c_imx_slave_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); match = device_get_match_data(&pdev->dev); if (match) diff --git a/drivers/idle/Makefile b/drivers/idle/Makefile index 0a3c37510079..a34af1ba09bd 100644 --- a/drivers/idle/Makefile +++ b/drivers/idle/Makefile @@ -1,3 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_INTEL_IDLE) += intel_idle.o +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + +obj-$(CONFIG_INTEL_IDLE) += intel_idle.o diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 0fdb1d1316c4..5687089e406a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -58,6 +58,7 @@ #include <asm/spec-ctrl.h> #include <asm/tsc.h> #include <asm/fpu/api.h> +#include <asm/smp.h> #define INTEL_IDLE_VERSION "0.5.1" @@ -229,6 +230,15 @@ static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev, return 0; } +static void intel_idle_enter_dead(struct cpuidle_device *dev, int index) +{ + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + + mwait_play_dead(eax); +} + /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. @@ -1804,6 +1814,7 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) mark_tsc_unstable("TSC halts in idle"); state->enter = intel_idle; + state->enter_dead = intel_idle_enter_dead; state->enter_s2idle = intel_idle_s2idle; } } @@ -2153,6 +2164,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) !cpuidle_state_table[cstate].enter_s2idle) break; + if (!cpuidle_state_table[cstate].enter_dead) + cpuidle_state_table[cstate].enter_dead = intel_idle_enter_dead; + /* If marked as unusable, skip this state. 
*/ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_UNUSABLE) { pr_debug("state %s is disabled\n", diff --git a/drivers/iio/adc/ti-tsc2046.c b/drivers/iio/adc/ti-tsc2046.c index 7dde5713973f..49560059f4b7 100644 --- a/drivers/iio/adc/ti-tsc2046.c +++ b/drivers/iio/adc/ti-tsc2046.c @@ -812,9 +812,7 @@ static int tsc2046_adc_probe(struct spi_device *spi) spin_lock_init(&priv->state_lock); priv->state = TSC2046_STATE_SHUTDOWN; - hrtimer_init(&priv->trig_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - priv->trig_timer.function = tsc2046_adc_timer; + hrtimer_setup(&priv->trig_timer, tsc2046_adc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); ret = devm_iio_trigger_register(dev, trig); if (ret) { diff --git a/drivers/iio/trigger/iio-trig-hrtimer.c b/drivers/iio/trigger/iio-trig-hrtimer.c index 716c795d08fb..82c72baccb62 100644 --- a/drivers/iio/trigger/iio-trig-hrtimer.c +++ b/drivers/iio/trigger/iio-trig-hrtimer.c @@ -145,8 +145,8 @@ static struct iio_sw_trigger *iio_trig_hrtimer_probe(const char *name) trig_info->swt.trigger->ops = &iio_hrtimer_trigger_ops; trig_info->swt.trigger->dev.groups = iio_hrtimer_attr_groups; - hrtimer_init(&trig_info->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - trig_info->timer.function = iio_hrtimer_trig_handler; + hrtimer_setup(&trig_info->timer, iio_hrtimer_trig_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); trig_info->sampling_frequency[0] = HRTIMER_DEFAULT_SAMPLING_FREQUENCY; trig_info->period = NSEC_PER_SEC / trig_info->sampling_frequency[0]; diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 3721446c6ba4..502a79136d4d 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -53,12 +53,6 @@ #define BNXT_RE_MAX_MR_SIZE_HIGH BIT_ULL(39) #define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH -#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) -#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) -#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) -#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) /* Number of MRs to reserve for PF, leaving remainder for VFs */ #define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a94c8c5387d9..4659a2f73364 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2130,8 +2130,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) * memory for the function and all child VFs */ rc = bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw, - &rdev->qplib_ctx, - BNXT_RE_MAX_QPC_COUNT); + &rdev->qplib_ctx); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate RCFW Channel: %#x\n", rc); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 5336f74297f8..457eecb99f96 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1217,8 +1217,6 @@ static void __modify_flags_from_init_state(struct bnxt_qplib_qp *qp) qp->path_mtu = CMDQ_MODIFY_QP_PATH_MTU_MTU_2048; } - qp->modify_flags &= - ~CMDQ_MODIFY_QP_MODIFY_MASK_VLAN_ID; /* Bono FW require the max_dest_rd_atomic to be >= 1 */ if (qp->max_dest_rd_atomic < 1) qp->max_dest_rd_atomic = 1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 17e62f22683b..d23074383428 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ 
b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -915,7 +915,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq); @@ -924,8 +923,7 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz) + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; @@ -969,12 +967,6 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - /* Allocate one extra to hold the QP1 entries */ - rcfw->qp_tbl_size = qp_tbl_sz + 1; - rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), - GFP_KERNEL); - if (!rcfw->qp_tbl) - goto fail; spin_lock_init(&rcfw->tbl_lock); rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 88814cb3aa74..ff873c5f1b25 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -262,8 +262,7 @@ static inline void bnxt_qplib_fill_cmdqmsg(struct bnxt_qplib_cmdqmsg *msg, void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_ctx *ctx, - int qp_tbl_sz); + struct bnxt_qplib_ctx *ctx); void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill); void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, @@ -285,9 +284,10 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn); void bnxt_qplib_mark_qp_error(void *qp_handle); + static inline u32 map_qp_id_to_tbl_indx(u32 qid, struct bnxt_qplib_rcfw *rcfw) { /* Last index of the qp_tbl is for QP1 ie. qp_tbl_size - 1*/ - return (qid == 1) ? rcfw->qp_tbl_size - 1 : qid % rcfw->qp_tbl_size - 2; + return (qid == 1) ? 
rcfw->qp_tbl_size - 1 : (qid % (rcfw->qp_tbl_size - 2)); } #endif /* __BNXT_QPLIB_RCFW_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 02922a0987ad..6cd05207ffed 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -871,6 +871,7 @@ int bnxt_qplib_init_res(struct bnxt_qplib_res *res) void bnxt_qplib_free_res(struct bnxt_qplib_res *res) { + kfree(res->rcfw->qp_tbl); bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl); bnxt_qplib_free_pd_tbl(&res->pd_tbl); bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); @@ -878,12 +879,20 @@ void bnxt_qplib_free_res(struct bnxt_qplib_res *res) int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev) { + struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct bnxt_qplib_dev_attr *dev_attr; int rc; res->netdev = netdev; dev_attr = res->dattr; + /* Allocate one extra to hold the QP1 entries */ + rcfw->qp_tbl_size = max_t(u32, BNXT_RE_MAX_QPC_COUNT + 1, dev_attr->max_qp); + rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), + GFP_KERNEL); + if (!rcfw->qp_tbl) + return -ENOMEM; + rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); if (rc) goto fail; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 711990232de1..6a13927674b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -49,6 +49,13 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; #define CHIP_NUM_58818 0xd818 #define CHIP_NUM_57608 0x1760 +#define BNXT_RE_MAX_QPC_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT (64 * 1024) +#define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) +#define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + #define BNXT_QPLIB_DBR_VALID (0x1UL << 26) #define BNXT_QPLIB_DBR_EPOCH_SHIFT 24 #define BNXT_QPLIB_DBR_TOGGLE_SHIFT 25 @@ -600,4 +607,9 @@ static inline bool _is_cq_coalescing_supported(u16 dev_cap_ext_flags2) return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_CQ_COALESCING_SUPPORTED; } +static inline bool _is_max_srq_ext_supported(u16 dev_cap_ext_flags_2) +{ + return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4ccd4405355a..f231e886ad9d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -176,6 +176,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) attr->dev_cap_flags = le16_to_cpu(sb->dev_cap_flags); attr->dev_cap_flags2 = le16_to_cpu(sb->dev_cap_ext_flags_2); + if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) + attr->max_srq += le16_to_cpu(sb->max_srq_ext); + bnxt_qplib_query_version(rcfw, attr->fw_ver); for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 0ee60fdc18b3..7eceb3e9f4ce 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2215,11 +2215,12 @@ struct creq_query_func_resp_sb { #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE (0x2UL << 4) #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE + #define 
CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL __le16 max_xp_qp_size; __le16 create_qp_batch_size; __le16 destroy_qp_batch_size; - __le16 reserved16; + __le16 max_srq_ext; __le64 reserved64; }; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index cbac4a442d9e..d6fbd9c2b8b4 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -635,12 +635,11 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, spin_lock_init(&ppd->cca_timer_lock); for (i = 0; i < OPA_MAX_SLS; i++) { - hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); ppd->cca_timer[i].ppd = ppd; ppd->cca_timer[i].sl = i; ppd->cca_timer[i].ccti = 0; - ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + hrtimer_setup(&ppd->cca_timer[i].hrtimer, cca_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 950c133d4220..6ee911f6885b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -175,8 +175,10 @@ void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC) ida_destroy(&hr_dev->xrcd_ida.ida); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) { ida_destroy(&hr_dev->srq_table.srq_ida.ida); + xa_destroy(&hr_dev->srq_table.xa); + } hns_roce_cleanup_qp_table(hr_dev); hns_roce_cleanup_cq_table(hr_dev); ida_destroy(&hr_dev->mr_table.mtpt_ida.ida); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 4106423a1b39..3a5c93c9fb3e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -537,5 +537,6 @@ void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) ida_destroy(&hr_dev->cq_table.bank[i].ida); + xa_destroy(&hr_dev->cq_table.array); mutex_destroy(&hr_dev->cq_table.bank_mutex); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 605562122ecc..ca0798224e56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1361,6 +1361,11 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev, return ret; } +/* This is the bottom bt pages number of a 100G MR on 4K OS, assuming + * the bt page size is not expanded by cal_best_bt_pg_sz() + */ +#define RESCHED_LOOP_CNT_THRESHOLD_ON_4K 12800 + /* construct the base address table and link them by address hop config */ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, @@ -1369,6 +1374,7 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, { const struct hns_roce_buf_region *r; int ofs, end; + int loop; int unit; int ret; int i; @@ -1386,7 +1392,10 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, continue; end = r->offset + r->count; - for (ofs = r->offset; ofs < end; ofs += unit) { + for (ofs = r->offset, loop = 1; ofs < end; ofs += unit, loop++) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + ret = hem_list_alloc_mid_bt(hr_dev, r, unit, ofs, hem_list->mid_bt[i], &hem_list->btm_bt); @@ -1443,9 +1452,14 @@ void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, struct list_head 
*head = &hem_list->btm_bt; struct hns_roce_hem_item *hem, *temp_hem; void *cpu_base = NULL; + int loop = 1; int nr = 0; list_for_each_entry_safe(hem, temp_hem, head, sibling) { + if (!(loop % RESCHED_LOOP_CNT_THRESHOLD_ON_4K)) + cond_resched(); + loop++; + if (hem_list_page_is_in_range(hem, offset)) { nr = offset - hem->start; cpu_base = hem->addr + nr * BA_BYTE_LEN; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index ae24c81c9812..cf89a8db4f64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -183,7 +183,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, IB_DEVICE_RC_RNR_NAK_GEN; props->max_send_sge = hr_dev->caps.max_sq_sg; props->max_recv_sge = hr_dev->caps.max_rq_sg; - props->max_sge_rd = 1; + props->max_sge_rd = hr_dev->caps.max_sq_sg; props->max_cq = hr_dev->caps.num_cqs; props->max_cqe = hr_dev->caps.max_cqes; props->max_mr = hr_dev->caps.num_mtpts; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 9e2e76c59406..8901c142c1b6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -868,12 +868,14 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp *ucmd, struct hns_roce_ib_create_qp_resp *resp) { + bool has_sdb = user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd); struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, ibucontext); + bool has_rdb = user_qp_has_rdb(hr_dev, init_attr, udata, resp); struct ib_device *ibdev = &hr_dev->ib_dev; int ret; - if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) { + if (has_sdb) { ret = hns_roce_db_map_user(uctx, ucmd->sdb_addr, &hr_qp->sdb); if (ret) { ibdev_err(ibdev, @@ -884,7 +886,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; } - if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) { + if (has_rdb) { ret = hns_roce_db_map_user(uctx, ucmd->db_addr, &hr_qp->rdb); if (ret) { ibdev_err(ibdev, @@ -898,7 +900,7 @@ static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, return 0; err_sdb: - if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) + if (has_sdb) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_out: return ret; @@ -1119,24 +1121,23 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, ibucontext); hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); - if (ret) + if (ret) { ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); + return ret; + } ret = set_congest_param(hr_dev, hr_qp, ucmd); - if (ret) - return ret; } else { if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) hr_qp->config = HNS_ROCE_EXSGE_FLAGS; + default_congest_type(hr_dev, hr_qp); ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, "failed to set kernel SQ size, ret = %d.\n", ret); - - default_congest_type(hr_dev, hr_qp); } return ret; @@ -1219,7 +1220,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, min(udata->outlen, sizeof(resp))); if (ret) { ibdev_err(ibdev, "copy qp resp failed!\n"); - goto err_store; + goto err_flow_ctrl; } } @@ -1602,6 +1603,7 @@ void hns_roce_cleanup_qp_table(struct hns_roce_dev *hr_dev) for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) ida_destroy(&hr_dev->qp_table.bank[i].ida); xa_destroy(&hr_dev->qp_table.dip_xa); + xa_destroy(&hr_dev->qp_table_xa); 
mutex_destroy(&hr_dev->qp_table.bank_mutex); mutex_destroy(&hr_dev->qp_table.scc_mutex); } diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 99036afb3aef..531a57f9ee7e 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -50,11 +50,12 @@ static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, return sport; } -static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, +static int create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_init_attr *init_attr) { struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; + int rate_val; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -67,8 +68,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = - (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4); + rate_val = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)); + if (rate_val < 0) + return rate_val; + ah->av.stat_rate_sl = rate_val << 4; if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) @@ -89,6 +92,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f; ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf); } + + return 0; } int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, @@ -121,8 +126,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, return err; } - create_ib_ah(dev, ah, init_attr); - return 0; + return create_ib_ah(dev, ah, init_attr); } int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index e6203e26cc06..614009fb9632 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1107,9 +1107,8 @@ int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, } /* initialize timers needed for rc qp */ timer_setup(&qp->s_timer, rvt_rc_timeout, 0); - hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - qp->s_rnr_timer.function = rvt_rc_rnr_retry; + hrtimer_setup(&qp->s_rnr_timer, rvt_rc_rnr_retry, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Driver needs to set up it's private QP structure and do any diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 1ba4a0c8726a..e27478fe9456 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -38,10 +38,8 @@ void rxe_dealloc(struct ib_device *ib_dev) } /* initialize rxe device parameters */ -static void rxe_init_device_param(struct rxe_dev *rxe) +static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { - struct net_device *ndev; - rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; @@ -74,15 +72,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; - ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); - if (!ndev) - return; - addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, ndev->dev_addr); - dev_put(ndev); - rxe->max_ucontext = RXE_MAX_UCONTEXT; } @@ -115,18 +107,13 @@ static void rxe_init_port_param(struct rxe_port *port) /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ 
-static void rxe_init_ports(struct rxe_dev *rxe) +static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; - struct net_device *ndev; rxe_init_port_param(port); - ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); - if (!ndev) - return; addrconf_addr_eui48((unsigned char *)&port->port_guid, ndev->dev_addr); - dev_put(ndev); spin_lock_init(&port->port_lock); } @@ -144,12 +131,12 @@ static void rxe_init_pools(struct rxe_dev *rxe) } /* initialize rxe device state */ -static void rxe_init(struct rxe_dev *rxe) +static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ - rxe_init_device_param(rxe); + rxe_init_device_param(rxe, ndev); - rxe_init_ports(rxe); + rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ @@ -184,7 +171,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, struct net_device *ndev) { - rxe_init(rxe); + rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); return rxe_register_device(rxe, ibdev_name, ndev); diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad..083fa9578bb7 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -590,13 +590,7 @@ config RISCV_IMSIC select IRQ_DOMAIN_HIERARCHY select GENERIC_IRQ_MATRIX_ALLOCATOR select GENERIC_MSI_IRQ - -config RISCV_IMSIC_PCI - bool - depends on RISCV_IMSIC - depends on PCI - depends on PCI_MSI - default RISCV_IMSIC + select IRQ_MSI_LIB config SIFIVE_PLIC bool @@ -752,6 +746,18 @@ config MCHP_EIC help Support for Microchip External Interrupt Controller. +config SOPHGO_SG2042_MSI + bool "Sophgo SG2042 MSI Controller" + depends on ARCH_SOPHGO || COMPILE_TEST + depends on PCI + select IRQ_DOMAIN_HIERARCHY + select IRQ_MSI_LIB + select PCI_MSI + help + Support for the Sophgo SG2042 MSI Controller. + This on-chip interrupt controller enables MSI sources to be + routed to the primary PLIC controller on SoC. 
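Both the new SG2042 MSI entry above and the msi_parent_ops hunks for GICv2m and the ITS further down lean on the shared MSI-parent infrastructure pulled in by IRQ_MSI_LIB. A rough sketch of what such a parent-ops definition looks like; the field names and the two chip flags mirror the hunks in this series, but the driver name, the particular flag selection and the use of msi_lib_init_dev_msi_info() are illustrative assumptions, not the SG2042 driver's actual table:

/* Hypothetical MSI-parent description for an irqchip driver. */
static const struct msi_parent_ops example_msi_parent_ops = {
	.required_flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
	.supported_flags	= MSI_FLAG_PCI_MSIX,
	.chip_flags		= MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
	.bus_select_token	= DOMAIN_BUS_NEXUS,
	.bus_select_mask	= MATCH_PCI_MSI,
	.prefix			= "EXAMPLE-",
	.init_dev_msi_info	= msi_lib_init_dev_msi_info,
};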
+ config SUNPLUS_SP7021_INTC bool "Sunplus SP7021 interrupt controller" if COMPILE_TEST default SOC_SP7021 diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 25e9ad29b8c4..dd60e597491d 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -128,4 +128,5 @@ obj-$(CONFIG_WPCM450_AIC) += irq-wpcm450-aic.o obj-$(CONFIG_IRQ_IDT3243X) += irq-idt3243x.o obj-$(CONFIG_APPLE_AIC) += irq-apple-aic.o obj-$(CONFIG_MCHP_EIC) += irq-mchp-eic.o +obj-$(CONFIG_SOPHGO_SG2042_MSI) += irq-sg2042-msi.o obj-$(CONFIG_SUNPLUS_SP7021_INTC) += irq-sp7021-intc.o diff --git a/drivers/irqchip/irq-davinci-cp-intc.c b/drivers/irqchip/irq-davinci-cp-intc.c index f4f8e9fadbbf..d7948c55f542 100644 --- a/drivers/irqchip/irq-davinci-cp-intc.c +++ b/drivers/irqchip/irq-davinci-cp-intc.c @@ -11,7 +11,6 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/irqchip.h> -#include <linux/irqchip/irq-davinci-cp-intc.h> #include <linux/irqdomain.h> #include <linux/io.h> #include <linux/of.h> @@ -154,24 +153,20 @@ static const struct irq_domain_ops davinci_cp_intc_irq_domain_ops = { .xlate = irq_domain_xlate_onetwocell, }; -static int __init -davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, - struct device_node *node) +static int __init davinci_cp_intc_do_init(struct resource *res, unsigned int num_irqs, + struct device_node *node) { - unsigned int num_regs = BITS_TO_LONGS(config->num_irqs); + unsigned int num_regs = BITS_TO_LONGS(num_irqs); int offset, irq_base; void __iomem *req; - req = request_mem_region(config->reg.start, - resource_size(&config->reg), - "davinci-cp-intc"); + req = request_mem_region(res->start, resource_size(res), "davinci-cp-intc"); if (!req) { pr_err("%s: register range busy\n", __func__); return -EBUSY; } - davinci_cp_intc_base = ioremap(config->reg.start, - resource_size(&config->reg)); + davinci_cp_intc_base = ioremap(res->start, resource_size(res)); if (!davinci_cp_intc_base) { pr_err("%s: unable to ioremap register range\n", __func__); return -EINVAL; @@ -184,8 +179,7 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, /* Disable system interrupts */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(~0, - DAVINCI_CP_INTC_SYS_ENABLE_CLR(offset)); + davinci_cp_intc_write(~0, DAVINCI_CP_INTC_SYS_ENABLE_CLR(offset)); /* Set to normal mode, no nesting, no priority hold */ davinci_cp_intc_write(0, DAVINCI_CP_INTC_CTRL); @@ -193,28 +187,25 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, /* Clear system interrupt status */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(~0, - DAVINCI_CP_INTC_SYS_STAT_CLR(offset)); + davinci_cp_intc_write(~0, DAVINCI_CP_INTC_SYS_STAT_CLR(offset)); /* Enable nIRQ (what about nFIQ?) */ davinci_cp_intc_write(1, DAVINCI_CP_INTC_HOST_ENABLE_IDX_SET); + /* 4 channels per register */ + num_regs = (num_irqs + 3) >> 2; /* Default all priorities to channel 7. 
*/ - num_regs = (config->num_irqs + 3) >> 2; /* 4 channels per register */ for (offset = 0; offset < num_regs; offset++) - davinci_cp_intc_write(0x07070707, - DAVINCI_CP_INTC_CHAN_MAP(offset)); + davinci_cp_intc_write(0x07070707, DAVINCI_CP_INTC_CHAN_MAP(offset)); - irq_base = irq_alloc_descs(-1, 0, config->num_irqs, 0); + irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); if (irq_base < 0) { - pr_err("%s: unable to allocate interrupt descriptors: %d\n", - __func__, irq_base); + pr_err("%s: unable to allocate interrupt descriptors: %d\n", __func__, irq_base); return irq_base; } - davinci_cp_intc_irq_domain = irq_domain_add_legacy( - node, config->num_irqs, irq_base, 0, - &davinci_cp_intc_irq_domain_ops, NULL); + davinci_cp_intc_irq_domain = irq_domain_add_legacy(node, num_irqs, irq_base, 0, + &davinci_cp_intc_irq_domain_ops, NULL); if (!davinci_cp_intc_irq_domain) { pr_err("%s: unable to create an interrupt domain\n", __func__); @@ -229,31 +220,25 @@ davinci_cp_intc_do_init(const struct davinci_cp_intc_config *config, return 0; } -int __init davinci_cp_intc_init(const struct davinci_cp_intc_config *config) -{ - return davinci_cp_intc_do_init(config, NULL); -} - static int __init davinci_cp_intc_of_init(struct device_node *node, struct device_node *parent) { - struct davinci_cp_intc_config config = { }; + unsigned int num_irqs; + struct resource res; int ret; - ret = of_address_to_resource(node, 0, &config.reg); + ret = of_address_to_resource(node, 0, &res); if (ret) { - pr_err("%s: unable to get the register range from device-tree\n", - __func__); + pr_err("%s: unable to get the register range from device-tree\n", __func__); return ret; } - ret = of_property_read_u32(node, "ti,intc-size", &config.num_irqs); + ret = of_property_read_u32(node, "ti,intc-size", &num_irqs); if (ret) { - pr_err("%s: unable to read the 'ti,intc-size' property\n", - __func__); + pr_err("%s: unable to read the 'ti,intc-size' property\n", __func__); return ret; } - return davinci_cp_intc_do_init(&config, node); + return davinci_cp_intc_do_init(&res, num_irqs, node); } IRQCHIP_DECLARE(cp_intc, "ti,cp-intc", davinci_cp_intc_of_init); diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index be35c5349986..1e3476c335ca 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -255,6 +255,7 @@ static void __init gicv2m_teardown(void) static struct msi_parent_ops gicv2m_msi_parent_ops = { .supported_flags = GICV2M_MSI_FLAGS_SUPPORTED, .required_flags = GICV2M_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "GICv2m-", diff --git a/drivers/irqchip/irq-gic-v3-its-msi-parent.c b/drivers/irqchip/irq-gic-v3-its-msi-parent.c index e150365fbe89..bdb04c808148 100644 --- a/drivers/irqchip/irq-gic-v3-its-msi-parent.c +++ b/drivers/irqchip/irq-gic-v3-its-msi-parent.c @@ -203,6 +203,7 @@ static bool its_init_dev_msi_info(struct device *dev, struct irq_domain *domain, const struct msi_parent_ops gic_v3_its_msi_parent_ops = { .supported_flags = ITS_MSI_FLAGS_SUPPORTED, .required_flags = ITS_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "ITS-", diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 8c3ec5734f1e..f30ed281882f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ 
b/drivers/irqchip/irq-gic-v3-its.c @@ -205,13 +205,15 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) +static gfp_t gfp_flags_quirk; + static struct page *its_alloc_pages_node(int node, gfp_t gfp, unsigned int order) { struct page *page; int ret = 0; - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_node(node, gfp | gfp_flags_quirk, order); if (!page) return NULL; @@ -4887,6 +4889,17 @@ static bool __maybe_unused its_enable_quirk_hip09_162100801(void *data) return true; } +static bool __maybe_unused its_enable_rk3568002(void *data) +{ + if (!of_machine_is_compatible("rockchip,rk3566") && + !of_machine_is_compatible("rockchip,rk3568")) + return false; + + gfp_flags_quirk |= GFP_DMA32; + + return true; +} + static const struct gic_quirk its_quirks[] = { #ifdef CONFIG_CAVIUM_ERRATUM_22375 { @@ -4954,6 +4967,14 @@ static const struct gic_quirk its_quirks[] = { .property = "dma-noncoherent", .init = its_set_non_coherent, }, +#ifdef CONFIG_ROCKCHIP_ERRATUM_3568002 + { + .desc = "ITS: Rockchip erratum RK3568002", + .iidr = 0x0201743b, + .mask = 0xffffffff, + .init = its_enable_rk3568002, + }, +#endif { } }; diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee17..3e1d8a1cda5e 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ -201,6 +201,7 @@ static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, static const struct msi_parent_ops gic_v3_mbi_msi_parent_ops = { .supported_flags = MBI_MSI_FLAGS_SUPPORTED, .required_flags = MBI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .prefix = "MBI-", diff --git a/drivers/irqchip/irq-imx-irqsteer.c b/drivers/irqchip/irq-imx-irqsteer.c index b0e9788c0045..afbfcce3b1e3 100644 --- a/drivers/irqchip/irq-imx-irqsteer.c +++ b/drivers/irqchip/irq-imx-irqsteer.c @@ -24,7 +24,7 @@ #define CHAN_MINTDIS(t) (CTRL_STRIDE_OFF(t, 3) + 0x4) #define CHAN_MASTRSTAT(t) (CTRL_STRIDE_OFF(t, 3) + 0x8) -#define CHAN_MAX_OUTPUT_INT 0x8 +#define CHAN_MAX_OUTPUT_INT 0xF struct irqsteer_data { void __iomem *regs; @@ -228,10 +228,8 @@ static int imx_irqsteer_probe(struct platform_device *pdev) for (i = 0; i < data->irq_count; i++) { data->irq[i] = irq_of_parse_and_map(np, i); - if (!data->irq[i]) { - ret = -EINVAL; - goto out; - } + if (!data->irq[i]) + break; irq_set_chained_handler_and_data(data->irq[i], imx_irqsteer_irq_handler, @@ -254,9 +252,13 @@ static void imx_irqsteer_remove(struct platform_device *pdev) struct irqsteer_data *irqsteer_data = platform_get_drvdata(pdev); int i; - for (i = 0; i < irqsteer_data->irq_count; i++) + for (i = 0; i < irqsteer_data->irq_count; i++) { + if (!irqsteer_data->irq[i]) + break; + irq_set_chained_handler_and_data(irqsteer_data->irq[i], NULL, NULL); + } irq_domain_remove(irqsteer_data->domain); diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c index 4342a21de1eb..69aacdfc8bef 100644 --- a/drivers/irqchip/irq-imx-mu-msi.c +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -214,6 +214,7 @@ static void imx_mu_msi_irq_handler(struct irq_desc *desc) static const struct msi_parent_ops imx_mu_msi_parent_ops = { .supported_flags = IMX_MU_MSI_FLAGS_SUPPORTED, .required_flags = IMX_MU_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, 
.bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "MU-MSI-", diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c index bd337ecddb40..9c62108b3ad5 100644 --- a/drivers/irqchip/irq-loongson-pch-msi.c +++ b/drivers/irqchip/irq-loongson-pch-msi.c @@ -146,6 +146,7 @@ static const struct irq_domain_ops pch_msi_middle_domain_ops = { static struct msi_parent_ops pch_msi_parent_ops = { .required_flags = PCH_MSI_FLAGS_REQUIRED, .supported_flags = PCH_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_mask = MATCH_PCI_MSI, .bus_select_token = DOMAIN_BUS_NEXUS, .prefix = "PCH-", diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c index d8e29fc0d406..51464c6257f3 100644 --- a/drivers/irqchip/irq-msi-lib.c +++ b/drivers/irqchip/irq-msi-lib.c @@ -28,6 +28,7 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, struct msi_domain_info *info) { const struct msi_parent_ops *pops = real_parent->msi_parent_ops; + struct irq_chip *chip = info->chip; u32 required_flags; /* Parent ops available? */ @@ -92,10 +93,10 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, info->flags |= required_flags; /* Chip updates for all child bus types */ - if (!info->chip->irq_eoi) - info->chip->irq_eoi = irq_chip_eoi_parent; - if (!info->chip->irq_ack) - info->chip->irq_ack = irq_chip_ack_parent; + if (!chip->irq_eoi && (pops->chip_flags & MSI_CHIP_FLAG_SET_EOI)) + chip->irq_eoi = irq_chip_eoi_parent; + if (!chip->irq_ack && (pops->chip_flags & MSI_CHIP_FLAG_SET_ACK)) + chip->irq_ack = irq_chip_ack_parent; /* * The device MSI domain can never have a set affinity callback. It @@ -105,7 +106,7 @@ bool msi_lib_init_dev_msi_info(struct device *dev, struct irq_domain *domain, * device MSI domain aside of mask/unmask which is provided e.g. by * PCI/MSI device domains. 
*/ - info->chip->irq_set_affinity = msi_domain_set_affinity; + chip->irq_set_affinity = msi_domain_set_affinity; return true; } EXPORT_SYMBOL_GPL(msi_lib_init_dev_msi_info); diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c index 2b6183919ea4..d67f93f6d750 100644 --- a/drivers/irqchip/irq-mvebu-gicp.c +++ b/drivers/irqchip/irq-mvebu-gicp.c @@ -161,6 +161,7 @@ static const struct irq_domain_ops gicp_domain_ops = { static const struct msi_parent_ops gicp_msi_parent_ops = { .supported_flags = GICP_MSI_FLAGS_SUPPORTED, .required_flags = GICP_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "GICP-", diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index ff19bfd258dc..28f7e81df94f 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -157,6 +157,7 @@ static const struct irq_domain_ops odmi_domain_ops = { static const struct msi_parent_ops odmi_msi_parent_ops = { .supported_flags = ODMI_MSI_FLAGS_SUPPORTED, .required_flags = ODMI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .bus_select_mask = MATCH_PLATFORM_MSI, .prefix = "ODMI-", diff --git a/drivers/irqchip/irq-mvebu-sei.c b/drivers/irqchip/irq-mvebu-sei.c index 065166ab5dbc..ebd4a9014e8d 100644 --- a/drivers/irqchip/irq-mvebu-sei.c +++ b/drivers/irqchip/irq-mvebu-sei.c @@ -356,6 +356,7 @@ static void mvebu_sei_reset(struct mvebu_sei *sei) static const struct msi_parent_ops sei_msi_parent_ops = { .supported_flags = SEI_MSI_FLAGS_SUPPORTED, .required_flags = SEI_MSI_FLAGS_REQUIRED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK, .bus_select_mask = MATCH_PLATFORM_MSI, .bus_select_token = DOMAIN_BUS_GENERIC_MSI, .prefix = "SEI-", diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 99e27e01b0b1..6a2e41f02446 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -541,43 +541,36 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * return -ENODEV; parent_domain = irq_find_host(parent); - if (!parent_domain) { - dev_err(&pdev->dev, "cannot find parent domain\n"); - return -ENODEV; - } + if (!parent_domain) + return dev_err_probe(dev, -ENODEV, "cannot find parent domain\n"); - rzg2l_irqc_data = devm_kzalloc(&pdev->dev, sizeof(*rzg2l_irqc_data), GFP_KERNEL); + rzg2l_irqc_data = devm_kzalloc(dev, sizeof(*rzg2l_irqc_data), GFP_KERNEL); if (!rzg2l_irqc_data) return -ENOMEM; rzg2l_irqc_data->irqchip = irq_chip; - rzg2l_irqc_data->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); + rzg2l_irqc_data->base = devm_of_iomap(dev, dev->of_node, 0, NULL); if (IS_ERR(rzg2l_irqc_data->base)) return PTR_ERR(rzg2l_irqc_data->base); ret = rzg2l_irqc_parse_interrupts(rzg2l_irqc_data, node); - if (ret) { - dev_err(&pdev->dev, "cannot parse interrupts: %d\n", ret); - return ret; - } - - resetn = devm_reset_control_get_exclusive(&pdev->dev, NULL); - if (IS_ERR(resetn)) - return PTR_ERR(resetn); + if (ret) + return dev_err_probe(dev, ret, "cannot parse interrupts: %d\n", ret); - ret = reset_control_deassert(resetn); - if (ret) { - dev_err(&pdev->dev, "failed to deassert resetn pin, %d\n", ret); - return ret; + resetn = devm_reset_control_get_exclusive_deasserted(dev, NULL); + if (IS_ERR(resetn)) { + return dev_err_probe(dev, 
PTR_ERR(resetn), + "failed to acquire deasserted reset: %d\n", ret); } - pm_runtime_enable(&pdev->dev); - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret < 0) { - dev_err(&pdev->dev, "pm_runtime_resume_and_get failed: %d\n", ret); - goto pm_disable; - } + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "devm_pm_runtime_enable failed: %d\n", ret); + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return dev_err_probe(dev, ret, "pm_runtime_resume_and_get failed: %d\n", ret); raw_spin_lock_init(&rzg2l_irqc_data->lock); @@ -585,9 +578,8 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * node, &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { - dev_err(&pdev->dev, "failed to add irq domain\n"); - ret = -ENOMEM; - goto pm_put; + pm_runtime_put(dev); + return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); } register_syscore_ops(&rzg2l_irqc_syscore_ops); @@ -604,13 +596,6 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * dev = NULL; return 0; - -pm_put: - pm_runtime_put(&pdev->dev); -pm_disable: - pm_runtime_disable(&pdev->dev); - reset_control_assert(resetn); - return ret; } static int __init rzg2l_irqc_init(struct device_node *node, diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index fe2d29e91026..3d5b5fdf9bde 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -64,11 +64,18 @@ #define ICU_TINT_LEVEL_HIGH 2 #define ICU_TINT_LEVEL_LOW 3 -#define ICU_TSSR_K(tint_nr) ((tint_nr) / 4) -#define ICU_TSSR_TSSEL_N(tint_nr) ((tint_nr) % 4) -#define ICU_TSSR_TSSEL_PREP(tssel, n) ((tssel) << ((n) * 8)) -#define ICU_TSSR_TSSEL_MASK(n) ICU_TSSR_TSSEL_PREP(0x7F, n) -#define ICU_TSSR_TIEN(n) (BIT(7) << ((n) * 8)) +#define ICU_TSSR_TSSEL_PREP(tssel, n, field_width) ((tssel) << ((n) * (field_width))) +#define ICU_TSSR_TSSEL_MASK(n, field_width) \ +({\ + typeof(field_width) (_field_width) = (field_width); \ + ICU_TSSR_TSSEL_PREP((GENMASK(((_field_width) - 2), 0)), (n), _field_width); \ +}) + +#define ICU_TSSR_TIEN(n, field_width) \ +({\ + typeof(field_width) (_field_width) = (field_width); \ + BIT((_field_width) - 1) << ((n) * (_field_width)); \ +}) #define ICU_TITSR_K(tint_nr) ((tint_nr) / 16) #define ICU_TITSR_TITSEL_N(tint_nr) ((tint_nr) % 16) @@ -78,20 +85,36 @@ #define ICU_TINT_EXTRACT_HWIRQ(x) FIELD_GET(GENMASK(15, 0), (x)) #define ICU_TINT_EXTRACT_GPIOINT(x) FIELD_GET(GENMASK(31, 16), (x)) -#define ICU_PB5_TINT 0x55 +#define ICU_RZG3E_TINT_OFFSET 0x800 +#define ICU_RZG3E_TSSEL_MAX_VAL 0x8c +#define ICU_RZV2H_TSSEL_MAX_VAL 0x55 + +/** + * struct rzv2h_hw_info - Interrupt Control Unit controller hardware info structure. + * @tssel_lut: TINT lookup table + * @t_offs: TINT offset + * @max_tssel: TSSEL max value + * @field_width: TSSR field width + */ +struct rzv2h_hw_info { + const u8 *tssel_lut; + u16 t_offs; + u8 max_tssel; + u8 field_width; +}; /** * struct rzv2h_icu_priv - Interrupt Control Unit controller private data structure. 
* @base: Controller's base address - * @irqchip: Pointer to struct irq_chip * @fwspec: IRQ firmware specific data * @lock: Lock to serialize access to hardware registers + * @info: Pointer to struct rzv2h_hw_info */ struct rzv2h_icu_priv { void __iomem *base; - const struct irq_chip *irqchip; struct irq_fwspec fwspec[ICU_NUM_IRQ]; raw_spinlock_t lock; + const struct rzv2h_hw_info *info; }; static inline struct rzv2h_icu_priv *irq_data_to_priv(struct irq_data *data) @@ -111,7 +134,7 @@ static void rzv2h_icu_eoi(struct irq_data *d) tintirq_nr = hw_irq - ICU_TINT_START; bit = BIT(tintirq_nr); if (!irqd_is_level_type(d)) - writel_relaxed(bit, priv->base + ICU_TSCLR); + writel_relaxed(bit, priv->base + priv->info->t_offs + ICU_TSCLR); } else if (hw_irq >= ICU_IRQ_START) { tintirq_nr = hw_irq - ICU_IRQ_START; bit = BIT(tintirq_nr); @@ -130,21 +153,23 @@ static void rzv2h_tint_irq_endisable(struct irq_data *d, bool enable) struct rzv2h_icu_priv *priv = irq_data_to_priv(d); unsigned int hw_irq = irqd_to_hwirq(d); u32 tint_nr, tssel_n, k, tssr; + u8 nr_tint; if (hw_irq < ICU_TINT_START) return; tint_nr = hw_irq - ICU_TINT_START; - k = ICU_TSSR_K(tint_nr); - tssel_n = ICU_TSSR_TSSEL_N(tint_nr); + nr_tint = 32 / priv->info->field_width; + k = tint_nr / nr_tint; + tssel_n = tint_nr % nr_tint; guard(raw_spinlock)(&priv->lock); - tssr = readl_relaxed(priv->base + ICU_TSSR(k)); + tssr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSSR(k)); if (enable) - tssr |= ICU_TSSR_TIEN(tssel_n); + tssr |= ICU_TSSR_TIEN(tssel_n, priv->info->field_width); else - tssr &= ~ICU_TSSR_TIEN(tssel_n); - writel_relaxed(tssr, priv->base + ICU_TSSR(k)); + tssr &= ~ICU_TSSR_TIEN(tssel_n, priv->info->field_width); + writel_relaxed(tssr, priv->base + priv->info->t_offs + ICU_TSSR(k)); } static void rzv2h_icu_irq_disable(struct irq_data *d) @@ -247,8 +272,8 @@ static void rzv2h_clear_tint_int(struct rzv2h_icu_priv *priv, unsigned int hwirq u32 bit = BIT(tint_nr); int k = tint_nr / 16; - tsctr = readl_relaxed(priv->base + ICU_TSCTR); - titsr = readl_relaxed(priv->base + ICU_TITSR(k)); + tsctr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSCTR); + titsr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TITSR(k)); titsel = ICU_TITSR_TITSEL_GET(titsr, titsel_n); /* @@ -257,7 +282,7 @@ static void rzv2h_clear_tint_int(struct rzv2h_icu_priv *priv, unsigned int hwirq */ if ((tsctr & bit) && ((titsel == ICU_TINT_EDGE_RISING) || (titsel == ICU_TINT_EDGE_FALLING))) - writel_relaxed(bit, priv->base + ICU_TSCLR); + writel_relaxed(bit, priv->base + priv->info->t_offs + ICU_TSCLR); } static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) @@ -268,6 +293,7 @@ static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) unsigned int hwirq; u32 tint, sense; int tint_nr; + u8 nr_tint; switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_LEVEL_LOW: @@ -290,39 +316,42 @@ static int rzv2h_tint_set_type(struct irq_data *d, unsigned int type) return -EINVAL; } + priv = irq_data_to_priv(d); tint = (u32)(uintptr_t)irq_data_get_irq_chip_data(d); - if (tint > ICU_PB5_TINT) + if (tint > priv->info->max_tssel) return -EINVAL; - priv = irq_data_to_priv(d); - hwirq = irqd_to_hwirq(d); + if (priv->info->tssel_lut) + tint = priv->info->tssel_lut[tint]; + hwirq = irqd_to_hwirq(d); tint_nr = hwirq - ICU_TINT_START; - tssr_k = ICU_TSSR_K(tint_nr); - tssel_n = ICU_TSSR_TSSEL_N(tint_nr); + nr_tint = 32 / priv->info->field_width; + tssr_k = tint_nr / nr_tint; + tssel_n = tint_nr % nr_tint; + tien = ICU_TSSR_TIEN(tssel_n, 
priv->info->field_width); titsr_k = ICU_TITSR_K(tint_nr); titsel_n = ICU_TITSR_TITSEL_N(tint_nr); - tien = ICU_TSSR_TIEN(titsel_n); guard(raw_spinlock)(&priv->lock); - tssr = readl_relaxed(priv->base + ICU_TSSR(tssr_k)); - tssr &= ~(ICU_TSSR_TSSEL_MASK(tssel_n) | tien); - tssr |= ICU_TSSR_TSSEL_PREP(tint, tssel_n); + tssr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); + tssr &= ~(ICU_TSSR_TSSEL_MASK(tssel_n, priv->info->field_width) | tien); + tssr |= ICU_TSSR_TSSEL_PREP(tint, tssel_n, priv->info->field_width); - writel_relaxed(tssr, priv->base + ICU_TSSR(tssr_k)); + writel_relaxed(tssr, priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); - titsr = readl_relaxed(priv->base + ICU_TITSR(titsr_k)); + titsr = readl_relaxed(priv->base + priv->info->t_offs + ICU_TITSR(titsr_k)); titsr &= ~ICU_TITSR_TITSEL_MASK(titsel_n); titsr |= ICU_TITSR_TITSEL_PREP(sense, titsel_n); - writel_relaxed(titsr, priv->base + ICU_TITSR(titsr_k)); + writel_relaxed(titsr, priv->base + priv->info->t_offs + ICU_TITSR(titsr_k)); rzv2h_clear_tint_int(priv, hwirq); - writel_relaxed(tssr | tien, priv->base + ICU_TSSR(tssr_k)); + writel_relaxed(tssr | tien, priv->base + priv->info->t_offs + ICU_TSSR(tssr_k)); return 0; } @@ -390,7 +419,7 @@ static int rzv2h_icu_alloc(struct irq_domain *domain, unsigned int virq, unsigne if (hwirq > (ICU_NUM_IRQ - 1)) return -EINVAL; - ret = irq_domain_set_hwirq_and_chip(domain, virq, hwirq, priv->irqchip, + ret = irq_domain_set_hwirq_and_chip(domain, virq, hwirq, &rzv2h_icu_chip, (void *)(uintptr_t)tint); if (ret) return ret; @@ -421,7 +450,13 @@ static int rzv2h_icu_parse_interrupts(struct rzv2h_icu_priv *priv, struct device return 0; } -static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +static void rzv2h_icu_put_device(void *data) +{ + put_device(data); +} + +static int rzv2h_icu_init_common(struct device_node *node, struct device_node *parent, + const struct rzv2h_hw_info *hw_info) { struct irq_domain *irq_domain, *parent_domain; struct rzv2h_icu_priv *rzv2h_icu_data; @@ -433,50 +468,48 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) if (!pdev) return -ENODEV; + ret = devm_add_action_or_reset(&pdev->dev, rzv2h_icu_put_device, + &pdev->dev); + if (ret < 0) + return ret; + parent_domain = irq_find_host(parent); if (!parent_domain) { dev_err(&pdev->dev, "cannot find parent domain\n"); - ret = -ENODEV; - goto put_dev; + return -ENODEV; } rzv2h_icu_data = devm_kzalloc(&pdev->dev, sizeof(*rzv2h_icu_data), GFP_KERNEL); - if (!rzv2h_icu_data) { - ret = -ENOMEM; - goto put_dev; - } - - rzv2h_icu_data->irqchip = &rzv2h_icu_chip; + if (!rzv2h_icu_data) + return -ENOMEM; rzv2h_icu_data->base = devm_of_iomap(&pdev->dev, pdev->dev.of_node, 0, NULL); - if (IS_ERR(rzv2h_icu_data->base)) { - ret = PTR_ERR(rzv2h_icu_data->base); - goto put_dev; - } + if (IS_ERR(rzv2h_icu_data->base)) + return PTR_ERR(rzv2h_icu_data->base); ret = rzv2h_icu_parse_interrupts(rzv2h_icu_data, node); if (ret) { dev_err(&pdev->dev, "cannot parse interrupts: %d\n", ret); - goto put_dev; + return ret; } - resetn = devm_reset_control_get_exclusive(&pdev->dev, NULL); + resetn = devm_reset_control_get_exclusive_deasserted(&pdev->dev, NULL); if (IS_ERR(resetn)) { ret = PTR_ERR(resetn); - goto put_dev; + dev_err(&pdev->dev, "failed to acquire deasserted reset: %d\n", ret); + return ret; } - ret = reset_control_deassert(resetn); - if (ret) { - dev_err(&pdev->dev, "failed to deassert resetn pin, %d\n", ret); - goto put_dev; + ret = 
devm_pm_runtime_enable(&pdev->dev); + if (ret < 0) { + dev_err(&pdev->dev, "devm_pm_runtime_enable failed, %d\n", ret); + return ret; } - pm_runtime_enable(&pdev->dev); ret = pm_runtime_resume_and_get(&pdev->dev); if (ret < 0) { dev_err(&pdev->dev, "pm_runtime_resume_and_get failed: %d\n", ret); - goto pm_disable; + return ret; } raw_spin_lock_init(&rzv2h_icu_data->lock); @@ -489,6 +522,8 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) goto pm_put; } + rzv2h_icu_data->info = hw_info; + /* * coccicheck complains about a missing put_device call before returning, but it's a false * positive. We still need &pdev->dev after successfully returning from this function. @@ -497,16 +532,61 @@ static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) pm_put: pm_runtime_put(&pdev->dev); -pm_disable: - pm_runtime_disable(&pdev->dev); - reset_control_assert(resetn); -put_dev: - put_device(&pdev->dev); return ret; } +/* Mapping based on port index on Table 4.2-6 and TSSEL bits on Table 4.6-4 */ +static const u8 rzg3e_tssel_lut[] = { + 81, 82, 83, 84, 85, 86, 87, 88, /* P00-P07 */ + 89, 90, 91, 92, 93, 94, 95, 96, /* P10-P17 */ + 111, 112, /* P20-P21 */ + 97, 98, 99, 100, 101, 102, 103, 104, /* P30-P37 */ + 105, 106, 107, 108, 109, 110, /* P40-P45 */ + 113, 114, 115, 116, 117, 118, 119, /* P50-P56 */ + 120, 121, 122, 123, 124, 125, 126, /* P60-P66 */ + 127, 128, 129, 130, 131, 132, 133, 134, /* P70-P77 */ + 135, 136, 137, 138, 139, 140, /* P80-P85 */ + 43, 44, 45, 46, 47, 48, 49, 50, /* PA0-PA7 */ + 51, 52, 53, 54, 55, 56, 57, 58, /* PB0-PB7 */ + 59, 60, 61, /* PC0-PC2 */ + 62, 63, 64, 65, 66, 67, 68, 69, /* PD0-PD7 */ + 70, 71, 72, 73, 74, 75, 76, 77, /* PE0-PE7 */ + 78, 79, 80, /* PF0-PF2 */ + 25, 26, 27, 28, 29, 30, 31, 32, /* PG0-PG7 */ + 33, 34, 35, 36, 37, 38, /* PH0-PH5 */ + 4, 5, 6, 7, 8, /* PJ0-PJ4 */ + 39, 40, 41, 42, /* PK0-PK3 */ + 9, 10, 11, 12, 21, 22, 23, 24, /* PL0-PL7 */ + 13, 14, 15, 16, 17, 18, 19, 20, /* PM0-PM7 */ + 0, 1, 2, 3 /* PS0-PS3 */ +}; + +static const struct rzv2h_hw_info rzg3e_hw_params = { + .tssel_lut = rzg3e_tssel_lut, + .t_offs = ICU_RZG3E_TINT_OFFSET, + .max_tssel = ICU_RZG3E_TSSEL_MAX_VAL, + .field_width = 16, +}; + +static const struct rzv2h_hw_info rzv2h_hw_params = { + .t_offs = 0, + .max_tssel = ICU_RZV2H_TSSEL_MAX_VAL, + .field_width = 8, +}; + +static int rzg3e_icu_init(struct device_node *node, struct device_node *parent) +{ + return rzv2h_icu_init_common(node, parent, &rzg3e_hw_params); +} + +static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +{ + return rzv2h_icu_init_common(node, parent, &rzv2h_hw_params); +} + IRQCHIP_PLATFORM_DRIVER_BEGIN(rzv2h_icu) +IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_init) IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_init) IRQCHIP_PLATFORM_DRIVER_END(rzv2h_icu) MODULE_AUTHOR("Fabrizio Castro <fabrizio.castro.jz@renesas.com>"); diff --git a/drivers/irqchip/irq-riscv-aplic-direct.c b/drivers/irqchip/irq-riscv-aplic-direct.c index 7cd6b646774b..205ad61d15e4 100644 --- a/drivers/irqchip/irq-riscv-aplic-direct.c +++ b/drivers/irqchip/irq-riscv-aplic-direct.c @@ -31,7 +31,7 @@ struct aplic_direct { }; struct aplic_idc { - unsigned int hart_index; + u32 hart_index; void __iomem *regs; struct aplic_direct *direct; }; @@ -219,6 +219,20 @@ static int aplic_direct_parse_parent_hwirq(struct device *dev, u32 index, return 0; } +static int aplic_direct_get_hart_index(struct device *dev, u32 logical_index, + u32 *hart_index) +{ + const char 
*prop_hart_index = "riscv,hart-indexes"; + struct device_node *np = to_of_node(dev->fwnode); + + if (!np || !of_property_present(np, prop_hart_index)) { + *hart_index = logical_index; + return 0; + } + + return of_property_read_u32_index(np, prop_hart_index, logical_index, hart_index); +} + int aplic_direct_setup(struct device *dev, void __iomem *regs) { int i, j, rc, cpu, current_cpu, setup_count = 0; @@ -265,8 +279,12 @@ int aplic_direct_setup(struct device *dev, void __iomem *regs) cpumask_set_cpu(cpu, &direct->lmask); idc = per_cpu_ptr(&aplic_idcs, cpu); - idc->hart_index = i; - idc->regs = priv->regs + APLIC_IDC_BASE + i * APLIC_IDC_SIZE; + rc = aplic_direct_get_hart_index(dev, i, &idc->hart_index); + if (rc) { + dev_warn(dev, "hart index not found for IDC%d\n", i); + continue; + } + idc->regs = priv->regs + APLIC_IDC_BASE + idc->hart_index * APLIC_IDC_SIZE; idc->direct = direct; aplic_idc_set_delivery(idc, true); diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index 275df5005705..d9ae87808651 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -73,10 +73,16 @@ static int __init imsic_ipi_domain_init(void) { return 0; } static void imsic_handle_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - int err, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct imsic_vector *vec; unsigned long local_id; + /* + * Process pending local synchronization instead of waiting + * for per-CPU local timer to expire. + */ + imsic_local_sync_all(false); + chained_irq_enter(chip, desc); while ((local_id = csr_swap(CSR_TOPEI, 0))) { @@ -97,9 +103,7 @@ static void imsic_handle_irq(struct irq_desc *desc) continue; } - err = generic_handle_domain_irq(imsic->base_domain, vec->hwirq); - if (unlikely(err)) - pr_warn_ratelimited("hwirq 0x%x mapping not found\n", vec->hwirq); + generic_handle_irq(vec->irq); } chained_irq_exit(chip, desc); @@ -120,7 +124,7 @@ static int imsic_starting_cpu(unsigned int cpu) * Interrupts identities might have been enabled/disabled while * this CPU was not running so sync-up local enable/disable state. */ - imsic_local_sync_all(); + imsic_local_sync_all(true); /* Enable local interrupt delivery */ imsic_local_delivery(true); diff --git a/drivers/irqchip/irq-riscv-imsic-platform.c b/drivers/irqchip/irq-riscv-imsic-platform.c index c708780e8760..b8ae67c25b37 100644 --- a/drivers/irqchip/irq-riscv-imsic-platform.c +++ b/drivers/irqchip/irq-riscv-imsic-platform.c @@ -20,6 +20,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> +#include "irq-msi-lib.h" #include "irq-riscv-imsic-state.h" static bool imsic_cpu_page_phys(unsigned int cpu, unsigned int guest_index, @@ -63,6 +64,11 @@ static int imsic_irq_retrigger(struct irq_data *d) return 0; } +static void imsic_irq_ack(struct irq_data *d) +{ + irq_move_irq(d); +} + static void imsic_irq_compose_vector_msg(struct imsic_vector *vec, struct msi_msg *msg) { phys_addr_t msi_addr; @@ -96,9 +102,23 @@ static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask bool force) { struct imsic_vector *old_vec, *new_vec; - struct irq_data *pd = d->parent_data; + struct imsic_vector tmp_vec; + + /* + * Requirements for the downstream irqdomains (or devices): + * + * 1) Downstream irqdomains (or devices) with atomic MSI update can + * happily do imsic_irq_set_affinity() in the process-context on + * any CPU so the irqchip of such irqdomains must not set the + * IRQCHIP_MOVE_DEFERRED flag. 
+ * + * 2) Downstream irqdomains (or devices) with non-atomic MSI update + * must use imsic_irq_set_affinity() in interrupt-context upon + * the next device interrupt so the irqchip of such irqdomains + * must set the IRQCHIP_MOVE_DEFERRED flag. + */ - old_vec = irq_data_get_irq_chip_data(pd); + old_vec = irq_data_get_irq_chip_data(d); if (WARN_ON(!old_vec)) return -ENOENT; @@ -111,34 +131,95 @@ static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask return -EBUSY; /* Get a new vector on the desired set of CPUs */ - new_vec = imsic_vector_alloc(old_vec->hwirq, mask_val); + new_vec = imsic_vector_alloc(old_vec->irq, mask_val); if (!new_vec) return -ENOSPC; + /* + * Device having non-atomic MSI update might see an intermediate + * state when changing target IMSIC vector from one CPU to another. + * + * To avoid losing interrupt to such intermediate state, do the + * following (just like x86 APIC): + * + * 1) First write a temporary IMSIC vector to the device which + * has MSI address same as the old IMSIC vector but MSI data + * matches the new IMSIC vector. + * + * 2) Next write the new IMSIC vector to the device. + * + * Based on the above, __imsic_local_sync() must check pending + * status of both old MSI data and new MSI data on the old CPU. + */ + if (!irq_can_move_in_process_context(d) && + new_vec->local_id != old_vec->local_id) { + /* Setup temporary vector */ + tmp_vec.cpu = old_vec->cpu; + tmp_vec.local_id = new_vec->local_id; + + /* Point device to the temporary vector */ + imsic_msi_update_msg(irq_get_irq_data(d->irq), &tmp_vec); + } + /* Point device to the new vector */ - imsic_msi_update_msg(d, new_vec); + imsic_msi_update_msg(irq_get_irq_data(d->irq), new_vec); /* Update irq descriptors with the new vector */ - pd->chip_data = new_vec; + d->chip_data = new_vec; - /* Update effective affinity of parent irq data */ - irq_data_update_effective_affinity(pd, cpumask_of(new_vec->cpu)); + /* Update effective affinity */ + irq_data_update_effective_affinity(d, cpumask_of(new_vec->cpu)); /* Move state of the old vector to the new vector */ imsic_vector_move(old_vec, new_vec); return IRQ_SET_MASK_OK_DONE; } + +static void imsic_irq_force_complete_move(struct irq_data *d) +{ + struct imsic_vector *mvec, *vec = irq_data_get_irq_chip_data(d); + unsigned int cpu = smp_processor_id(); + + if (WARN_ON(!vec)) + return; + + /* Do nothing if there is no in-flight move */ + mvec = imsic_vector_get_move(vec); + if (!mvec) + return; + + /* Do nothing if the old IMSIC vector does not belong to current CPU */ + if (mvec->cpu != cpu) + return; + + /* + * The best we can do is force cleanup the old IMSIC vector. + * + * The challenges over here are same as x86 vector domain so + * refer to the comments in irq_force_complete_move() function + * implemented at arch/x86/kernel/apic/vector.c.
+ */ + + /* Force cleanup in-flight move */ + pr_info("IRQ fixup: irq %d move in progress, old vector cpu %d local_id %d\n", + d->irq, mvec->cpu, mvec->local_id); + imsic_vector_force_move_cleanup(vec); +} #endif static struct irq_chip imsic_irq_base_chip = { - .name = "IMSIC", - .irq_mask = imsic_irq_mask, - .irq_unmask = imsic_irq_unmask, - .irq_retrigger = imsic_irq_retrigger, - .irq_compose_msi_msg = imsic_irq_compose_msg, - .flags = IRQCHIP_SKIP_SET_WAKE | - IRQCHIP_MASK_ON_SUSPEND, + .name = "IMSIC", + .irq_mask = imsic_irq_mask, + .irq_unmask = imsic_irq_unmask, +#ifdef CONFIG_SMP + .irq_set_affinity = imsic_irq_set_affinity, + .irq_force_complete_move = imsic_irq_force_complete_move, +#endif + .irq_retrigger = imsic_irq_retrigger, + .irq_ack = imsic_irq_ack, + .irq_compose_msi_msg = imsic_irq_compose_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MASK_ON_SUSPEND, }; static int imsic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -155,7 +236,7 @@ static int imsic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, return -ENOSPC; irq_domain_set_info(domain, virq, virq, &imsic_irq_base_chip, vec, - handle_simple_irq, NULL, NULL); + handle_edge_irq, NULL, NULL); irq_set_noprobe(virq); irq_set_affinity(virq, cpu_online_mask); irq_data_update_effective_affinity(irq_get_irq_data(virq), cpumask_of(vec->cpu)); @@ -172,22 +253,6 @@ static void imsic_irq_domain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_parent(domain, virq, nr_irqs); } -static int imsic_irq_domain_select(struct irq_domain *domain, struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token) -{ - const struct msi_parent_ops *ops = domain->msi_parent_ops; - u32 busmask = BIT(bus_token); - - if (fwspec->fwnode != domain->fwnode || fwspec->param_count != 0) - return 0; - - /* Handle pure domain searches */ - if (bus_token == ops->bus_select_token) - return 1; - - return !!(ops->bus_select_mask & busmask); -} - #ifdef CONFIG_GENERIC_IRQ_DEBUGFS static void imsic_irq_debug_show(struct seq_file *m, struct irq_domain *d, struct irq_data *irqd, int ind) @@ -204,107 +269,37 @@ static void imsic_irq_debug_show(struct seq_file *m, struct irq_domain *d, static const struct irq_domain_ops imsic_base_domain_ops = { .alloc = imsic_irq_domain_alloc, .free = imsic_irq_domain_free, - .select = imsic_irq_domain_select, + .select = msi_lib_irq_domain_select, #ifdef CONFIG_GENERIC_IRQ_DEBUGFS .debug_show = imsic_irq_debug_show, #endif }; -#ifdef CONFIG_RISCV_IMSIC_PCI - -static void imsic_pci_mask_irq(struct irq_data *d) -{ - pci_msi_mask_irq(d); - irq_chip_mask_parent(d); -} - -static void imsic_pci_unmask_irq(struct irq_data *d) +static bool imsic_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) { - irq_chip_unmask_parent(d); - pci_msi_unmask_irq(d); -} - -#define MATCH_PCI_MSI BIT(DOMAIN_BUS_PCI_MSI) - -#else - -#define MATCH_PCI_MSI 0 - -#endif - -static bool imsic_init_dev_msi_info(struct device *dev, - struct irq_domain *domain, - struct irq_domain *real_parent, - struct msi_domain_info *info) -{ - const struct msi_parent_ops *pops = real_parent->msi_parent_ops; - - /* MSI parent domain specific settings */ - switch (real_parent->bus_token) { - case DOMAIN_BUS_NEXUS: - if (WARN_ON_ONCE(domain != real_parent)) - return false; -#ifdef CONFIG_SMP - info->chip->irq_set_affinity = imsic_irq_set_affinity; -#endif - break; - default: - WARN_ON_ONCE(1); + if (!msi_lib_init_dev_msi_info(dev, domain, 
real_parent, info)) return false; - } - /* Is the target supported? */ switch (info->bus_token) { -#ifdef CONFIG_RISCV_IMSIC_PCI case DOMAIN_BUS_PCI_DEVICE_MSI: case DOMAIN_BUS_PCI_DEVICE_MSIX: - info->chip->irq_mask = imsic_pci_mask_irq; - info->chip->irq_unmask = imsic_pci_unmask_irq; - break; -#endif - case DOMAIN_BUS_DEVICE_MSI: - /* - * Per-device MSI should never have any MSI feature bits - * set. It's sole purpose is to create a dumb interrupt - * chip which has a device specific irq_write_msi_msg() - * callback. - */ - if (WARN_ON_ONCE(info->flags)) - return false; - - /* Core managed MSI descriptors */ - info->flags |= MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS | - MSI_FLAG_FREE_MSI_DESCS; - break; - case DOMAIN_BUS_WIRED_TO_MSI: + info->chip->flags |= IRQCHIP_MOVE_DEFERRED; break; default: - WARN_ON_ONCE(1); - return false; + break; } - /* Use hierarchial chip operations re-trigger */ - info->chip->irq_retrigger = irq_chip_retrigger_hierarchy; - - /* - * Mask out the domain specific MSI feature flags which are not - * supported by the real parent. - */ - info->flags &= pops->supported_flags; - - /* Enforce the required flags */ - info->flags |= pops->required_flags; - return true; } -#define MATCH_PLATFORM_MSI BIT(DOMAIN_BUS_PLATFORM_MSI) - static const struct msi_parent_ops imsic_msi_parent_ops = { .supported_flags = MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX, .required_flags = MSI_FLAG_USE_DEF_DOM_OPS | - MSI_FLAG_USE_DEF_CHIP_OPS, + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSI_MASK_PARENT, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, .bus_select_token = DOMAIN_BUS_NEXUS, .bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI, .init_dev_msi_info = imsic_init_dev_msi_info, diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index b97e6cd89ed7..bdf5cd2037f2 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -124,10 +124,11 @@ void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, } } -static void __imsic_local_sync(struct imsic_local_priv *lpriv) +static bool __imsic_local_sync(struct imsic_local_priv *lpriv) { - struct imsic_local_config *mlocal; - struct imsic_vector *vec, *mvec; + struct imsic_local_config *tlocal, *mlocal; + struct imsic_vector *vec, *tvec, *mvec; + bool ret = true; int i; lockdep_assert_held(&lpriv->lock); @@ -143,35 +144,97 @@ static void __imsic_local_sync(struct imsic_local_priv *lpriv) __imsic_id_clear_enable(i); /* - * If the ID was being moved to a new ID on some other CPU - * then we can get a MSI during the movement so check the - * ID pending bit and re-trigger the new ID on other CPU - * using MMIO write. + * Clear the previous vector pointer of the new vector only + * after the movement is complete on the old CPU. */ - mvec = READ_ONCE(vec->move); - WRITE_ONCE(vec->move, NULL); - if (mvec && mvec != vec) { - if (__imsic_id_read_clear_pending(i)) { + mvec = READ_ONCE(vec->move_prev); + if (mvec) { + /* + * If the old vector has not been updated then + * try again in the next sync-up call. + */ + if (READ_ONCE(mvec->move_next)) { + ret = false; + continue; + } + + WRITE_ONCE(vec->move_prev, NULL); + } + + /* + * If a vector was being moved to a new vector on some other + * CPU then we can get a MSI during the movement so check the + * ID pending bit and re-trigger the new ID on other CPU using + * MMIO write. 
+ */ + mvec = READ_ONCE(vec->move_next); + if (mvec) { + /* + * Devices having non-atomic MSI update might see + * an intermediate state so check both old ID and + * new ID for pending interrupts. + * + * For details, see imsic_irq_set_affinity(). + */ + tvec = vec->local_id == mvec->local_id ? + NULL : &lpriv->vectors[mvec->local_id]; + + if (tvec && !irq_can_move_in_process_context(irq_get_irq_data(vec->irq)) && + __imsic_id_read_clear_pending(tvec->local_id)) { + /* Retrigger temporary vector if it was already in-use */ + if (READ_ONCE(tvec->enable)) { + tlocal = per_cpu_ptr(imsic->global.local, tvec->cpu); + writel_relaxed(tvec->local_id, tlocal->msi_va); + } + + mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu); + writel_relaxed(mvec->local_id, mlocal->msi_va); + } + + if (__imsic_id_read_clear_pending(vec->local_id)) { mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu); writel_relaxed(mvec->local_id, mlocal->msi_va); } - imsic_vector_free(&lpriv->vectors[i]); + WRITE_ONCE(vec->move_next, NULL); + imsic_vector_free(vec); } skip: bitmap_clear(lpriv->dirty_bitmap, i, 1); } + + return ret; } -void imsic_local_sync_all(void) +#ifdef CONFIG_SMP +static void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +{ + lockdep_assert_held(&lpriv->lock); + + if (!timer_pending(&lpriv->timer)) { + lpriv->timer.expires = jiffies + 1; + add_timer_on(&lpriv->timer, smp_processor_id()); + } +} +#else +static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv) +{ +} +#endif + +void imsic_local_sync_all(bool force_all) { struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv); unsigned long flags; raw_spin_lock_irqsave(&lpriv->lock, flags); - bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1); - __imsic_local_sync(lpriv); + + if (force_all) + bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1); + if (!__imsic_local_sync(lpriv)) + __imsic_local_timer_start(lpriv); + raw_spin_unlock_irqrestore(&lpriv->lock, flags); } @@ -190,12 +253,7 @@ void imsic_local_delivery(bool enable) #ifdef CONFIG_SMP static void imsic_local_timer_callback(struct timer_list *timer) { - struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv); - unsigned long flags; - - raw_spin_lock_irqsave(&lpriv->lock, flags); - __imsic_local_sync(lpriv); - raw_spin_unlock_irqrestore(&lpriv->lock, flags); + imsic_local_sync_all(false); } static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu) @@ -216,14 +274,11 @@ static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu */ if (cpu_online(cpu)) { if (cpu == smp_processor_id()) { - __imsic_local_sync(lpriv); - return; + if (__imsic_local_sync(lpriv)) + return; } - if (!timer_pending(&lpriv->timer)) { - lpriv->timer.expires = jiffies + 1; - add_timer_on(&lpriv->timer, cpu); - } + __imsic_local_timer_start(lpriv); } } #else @@ -278,8 +333,26 @@ void imsic_vector_unmask(struct imsic_vector *vec) raw_spin_unlock(&lpriv->lock); } -static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsic_vector *vec, - bool new_enable, struct imsic_vector *new_move) +void imsic_vector_force_move_cleanup(struct imsic_vector *vec) +{ + struct imsic_local_priv *lpriv; + struct imsic_vector *mvec; + unsigned long flags; + + lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu); + raw_spin_lock_irqsave(&lpriv->lock, flags); + + mvec = READ_ONCE(vec->move_prev); + WRITE_ONCE(vec->move_prev, NULL); + if (mvec) + imsic_vector_free(mvec); + + raw_spin_unlock_irqrestore(&lpriv->lock, flags); +} + +static bool 
imsic_vector_move_update(struct imsic_local_priv *lpriv, + struct imsic_vector *vec, bool is_old_vec, + bool new_enable, struct imsic_vector *move_vec) { unsigned long flags; bool enabled; @@ -289,7 +362,10 @@ static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsi /* Update enable and move details */ enabled = READ_ONCE(vec->enable); WRITE_ONCE(vec->enable, new_enable); - WRITE_ONCE(vec->move, new_move); + if (is_old_vec) + WRITE_ONCE(vec->move_next, move_vec); + else + WRITE_ONCE(vec->move_prev, move_vec); /* Mark the vector as dirty and synchronize */ bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1); @@ -322,8 +398,8 @@ void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_ve * interrupt on the old vector while device was being moved * to the new vector. */ - enabled = imsic_vector_move_update(old_lpriv, old_vec, false, new_vec); - imsic_vector_move_update(new_lpriv, new_vec, enabled, new_vec); + enabled = imsic_vector_move_update(old_lpriv, old_vec, true, false, new_vec); + imsic_vector_move_update(new_lpriv, new_vec, false, enabled, old_vec); } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS @@ -368,7 +444,7 @@ struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int l return &lpriv->vectors[local_id]; } -struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask) +struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask) { struct imsic_vector *vec = NULL; struct imsic_local_priv *lpriv; @@ -384,9 +460,10 @@ struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask lpriv = per_cpu_ptr(imsic->lpriv, cpu); vec = &lpriv->vectors[local_id]; - vec->hwirq = hwirq; + vec->irq = irq; vec->enable = false; - vec->move = NULL; + vec->move_next = NULL; + vec->move_prev = NULL; return vec; } @@ -396,7 +473,7 @@ void imsic_vector_free(struct imsic_vector *vec) unsigned long flags; raw_spin_lock_irqsave(&imsic->matrix_lock, flags); - vec->hwirq = UINT_MAX; + vec->irq = 0; irq_matrix_free(imsic->matrix, vec->cpu, vec->local_id, false); raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags); } @@ -455,7 +532,7 @@ static int __init imsic_local_init(void) vec = &lpriv->vectors[i]; vec->cpu = cpu; vec->local_id = i; - vec->hwirq = UINT_MAX; + vec->irq = 0; } } diff --git a/drivers/irqchip/irq-riscv-imsic-state.h b/drivers/irqchip/irq-riscv-imsic-state.h index 391e44280827..3202ffa4e849 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.h +++ b/drivers/irqchip/irq-riscv-imsic-state.h @@ -20,10 +20,11 @@ struct imsic_vector { unsigned int cpu; unsigned int local_id; /* Details saved by driver in the vector */ - unsigned int hwirq; + unsigned int irq; /* Details accessed using local lock held */ bool enable; - struct imsic_vector *move; + struct imsic_vector *move_next; + struct imsic_vector *move_prev; }; struct imsic_local_priv { @@ -74,7 +75,7 @@ static inline void __imsic_id_clear_enable(unsigned long id) __imsic_eix_update(id, 1, false, false); } -void imsic_local_sync_all(void); +void imsic_local_sync_all(bool force_all); void imsic_local_delivery(bool enable); void imsic_vector_mask(struct imsic_vector *vec); @@ -87,14 +88,15 @@ static inline bool imsic_vector_isenabled(struct imsic_vector *vec) static inline struct imsic_vector *imsic_vector_get_move(struct imsic_vector *vec) { - return READ_ONCE(vec->move); + return READ_ONCE(vec->move_prev); } +void imsic_vector_force_move_cleanup(struct imsic_vector *vec); void imsic_vector_move(struct imsic_vector *old_vec, 
struct imsic_vector *new_vec); struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id); -struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask); +struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask); void imsic_vector_free(struct imsic_vector *vector); void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind); diff --git a/drivers/irqchip/irq-sg2042-msi.c b/drivers/irqchip/irq-sg2042-msi.c new file mode 100644 index 000000000000..ee682e87eb8b --- /dev/null +++ b/drivers/irqchip/irq-sg2042-msi.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SG2042 MSI Controller + * + * Copyright (C) 2024 Sophgo Technology Inc. + * Copyright (C) 2024 Chen Wang <unicorn_wang@outlook.com> + */ + +#include <linux/cleanup.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/msi.h> +#include <linux/platform_device.h> +#include <linux/property.h> +#include <linux/slab.h> + +#include "irq-msi-lib.h" + +#define SG2042_MAX_MSI_VECTOR 32 + +struct sg2042_msi_chipdata { + void __iomem *reg_clr; // clear reg, see TRM, 10.1.33, GP_INTR0_CLR + + phys_addr_t doorbell_addr; // see TRM, 10.1.32, GP_INTR0_SET + + u32 irq_first; // The vector number that MSIs starts + u32 num_irqs; // The number of vectors for MSIs + + DECLARE_BITMAP(msi_map, SG2042_MAX_MSI_VECTOR); + struct mutex msi_map_lock; // lock for msi_map +}; + +static int sg2042_msi_allocate_hwirq(struct sg2042_msi_chipdata *data, int num_req) +{ + int first; + + guard(mutex)(&data->msi_map_lock); + first = bitmap_find_free_region(data->msi_map, data->num_irqs, + get_count_order(num_req)); + return first >= 0 ? 
first : -ENOSPC; +} + +static void sg2042_msi_free_hwirq(struct sg2042_msi_chipdata *data, int hwirq, int num_req) +{ + guard(mutex)(&data->msi_map_lock); + bitmap_release_region(data->msi_map, hwirq, get_count_order(num_req)); +} + +static void sg2042_msi_irq_ack(struct irq_data *d) +{ + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + int bit_off = d->hwirq; + + writel(1 << bit_off, data->reg_clr); + + irq_chip_ack_parent(d); +} + +static void sg2042_msi_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + + msg->address_hi = upper_32_bits(data->doorbell_addr); + msg->address_lo = lower_32_bits(data->doorbell_addr); + msg->data = 1 << d->hwirq; +} + +static const struct irq_chip sg2042_msi_middle_irq_chip = { + .name = "SG2042 MSI", + .irq_ack = sg2042_msi_irq_ack, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, +#ifdef CONFIG_SMP + .irq_set_affinity = irq_chip_set_affinity_parent, +#endif + .irq_compose_msi_msg = sg2042_msi_irq_compose_msi_msg, +}; + +static int sg2042_msi_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, int hwirq) +{ + struct sg2042_msi_chipdata *data = domain->host_data; + struct irq_fwspec fwspec; + struct irq_data *d; + int ret; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 2; + fwspec.param[0] = data->irq_first + hwirq; + fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) + return ret; + + d = irq_domain_get_irq_data(domain->parent, virq); + return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); +} + +static int sg2042_msi_middle_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct sg2042_msi_chipdata *data = domain->host_data; + int hwirq, err, i; + + hwirq = sg2042_msi_allocate_hwirq(data, nr_irqs); + if (hwirq < 0) + return hwirq; + + for (i = 0; i < nr_irqs; i++) { + err = sg2042_msi_parent_domain_alloc(domain, virq + i, hwirq + i); + if (err) + goto err_hwirq; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &sg2042_msi_middle_irq_chip, data); + } + + return 0; + +err_hwirq: + sg2042_msi_free_hwirq(data, hwirq, nr_irqs); + irq_domain_free_irqs_parent(domain, virq, i); + + return err; +} + +static void sg2042_msi_middle_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct sg2042_msi_chipdata *data = irq_data_get_irq_chip_data(d); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + sg2042_msi_free_hwirq(data, d->hwirq, nr_irqs); +} + +static const struct irq_domain_ops sg2042_msi_middle_domain_ops = { + .alloc = sg2042_msi_middle_domain_alloc, + .free = sg2042_msi_middle_domain_free, + .select = msi_lib_irq_domain_select, +}; + +#define SG2042_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS) + +#define SG2042_MSI_FLAGS_SUPPORTED MSI_GENERIC_FLAGS_MASK + +static const struct msi_parent_ops sg2042_msi_parent_ops = { + .required_flags = SG2042_MSI_FLAGS_REQUIRED, + .supported_flags = SG2042_MSI_FLAGS_SUPPORTED, + .bus_select_mask = MATCH_PCI_MSI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .prefix = "SG2042-", + .init_dev_msi_info = msi_lib_init_dev_msi_info, +}; + +static int sg2042_msi_init_domains(struct sg2042_msi_chipdata *data, + struct irq_domain *plic_domain, struct device *dev) +{ + struct fwnode_handle *fwnode = 
dev_fwnode(dev); + struct irq_domain *middle_domain; + + middle_domain = irq_domain_create_hierarchy(plic_domain, 0, data->num_irqs, fwnode, + &sg2042_msi_middle_domain_ops, data); + if (!middle_domain) { + pr_err("Failed to create the MSI middle domain\n"); + return -ENOMEM; + } + + irq_domain_update_bus_token(middle_domain, DOMAIN_BUS_NEXUS); + + middle_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + middle_domain->msi_parent_ops = &sg2042_msi_parent_ops; + + return 0; +} + +static int sg2042_msi_probe(struct platform_device *pdev) +{ + struct fwnode_reference_args args = { }; + struct sg2042_msi_chipdata *data; + struct device *dev = &pdev->dev; + struct irq_domain *plic_domain; + struct resource *res; + int ret; + + data = devm_kzalloc(dev, sizeof(struct sg2042_msi_chipdata), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->reg_clr = devm_platform_ioremap_resource_byname(pdev, "clr"); + if (IS_ERR(data->reg_clr)) { + dev_err(dev, "Failed to map clear register\n"); + return PTR_ERR(data->reg_clr); + } + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "doorbell"); + if (!res) { + dev_err(dev, "Failed get resource from set\n"); + return -EINVAL; + } + data->doorbell_addr = res->start; + + ret = fwnode_property_get_reference_args(dev_fwnode(dev), "msi-ranges", + "#interrupt-cells", 0, 0, &args); + if (ret) { + dev_err(dev, "Unable to parse MSI vec base\n"); + return ret; + } + fwnode_handle_put(args.fwnode); + + ret = fwnode_property_get_reference_args(dev_fwnode(dev), "msi-ranges", NULL, + args.nargs + 1, 0, &args); + if (ret) { + dev_err(dev, "Unable to parse MSI vec number\n"); + return ret; + } + + plic_domain = irq_find_matching_fwnode(args.fwnode, DOMAIN_BUS_ANY); + fwnode_handle_put(args.fwnode); + if (!plic_domain) { + pr_err("Failed to find the PLIC domain\n"); + return -ENXIO; + } + + data->irq_first = (u32)args.args[0]; + data->num_irqs = (u32)args.args[args.nargs - 1]; + + mutex_init(&data->msi_map_lock); + + return sg2042_msi_init_domains(data, plic_domain, dev); +} + +static const struct of_device_id sg2042_msi_of_match[] = { + { .compatible = "sophgo,sg2042-msi" }, + { } +}; + +static struct platform_driver sg2042_msi_driver = { + .driver = { + .name = "sg2042-msi", + .of_match_table = sg2042_msi_of_match, + }, + .probe = sg2042_msi_probe, +}; +builtin_platform_driver(sg2042_msi_driver); diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 0b4312152024..01b0d8321728 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -48,32 +48,41 @@ enum { SUNXI_SRC_TYPE_EDGE_RISING, }; -struct sunxi_sc_nmi_reg_offs { - u32 ctrl; - u32 pend; - u32 enable; +struct sunxi_sc_nmi_data { + struct { + u32 ctrl; + u32 pend; + u32 enable; + } reg_offs; + u32 enable_val; }; -static const struct sunxi_sc_nmi_reg_offs sun6i_reg_offs __initconst = { - .ctrl = SUN6I_NMI_CTRL, - .pend = SUN6I_NMI_PENDING, - .enable = SUN6I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun6i_data __initconst = { + .reg_offs.ctrl = SUN6I_NMI_CTRL, + .reg_offs.pend = SUN6I_NMI_PENDING, + .reg_offs.enable = SUN6I_NMI_ENABLE, }; -static const struct sunxi_sc_nmi_reg_offs sun7i_reg_offs __initconst = { - .ctrl = SUN7I_NMI_CTRL, - .pend = SUN7I_NMI_PENDING, - .enable = SUN7I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun7i_data __initconst = { + .reg_offs.ctrl = SUN7I_NMI_CTRL, + .reg_offs.pend = SUN7I_NMI_PENDING, + .reg_offs.enable = SUN7I_NMI_ENABLE, }; -static const struct sunxi_sc_nmi_reg_offs sun9i_reg_offs __initconst = { - 
.ctrl = SUN9I_NMI_CTRL, - .pend = SUN9I_NMI_PENDING, - .enable = SUN9I_NMI_ENABLE, +static const struct sunxi_sc_nmi_data sun9i_data __initconst = { + .reg_offs.ctrl = SUN9I_NMI_CTRL, + .reg_offs.pend = SUN9I_NMI_PENDING, + .reg_offs.enable = SUN9I_NMI_ENABLE, }; -static inline void sunxi_sc_nmi_write(struct irq_chip_generic *gc, u32 off, - u32 val) +static const struct sunxi_sc_nmi_data sun55i_a523_data __initconst = { + .reg_offs.ctrl = SUN9I_NMI_CTRL, + .reg_offs.pend = SUN9I_NMI_PENDING, + .reg_offs.enable = SUN9I_NMI_ENABLE, + .enable_val = BIT(31), +}; + +static inline void sunxi_sc_nmi_write(struct irq_chip_generic *gc, u32 off, u32 val) { irq_reg_writel(gc, val, off); } @@ -143,15 +152,13 @@ static int sunxi_sc_nmi_set_type(struct irq_data *data, unsigned int flow_type) } static int __init sunxi_sc_nmi_irq_init(struct device_node *node, - const struct sunxi_sc_nmi_reg_offs *reg_offs) + const struct sunxi_sc_nmi_data *data) { - struct irq_domain *domain; + unsigned int irq, clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; struct irq_chip_generic *gc; - unsigned int irq; - unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + struct irq_domain *domain; int ret; - domain = irq_domain_add_linear(node, 1, &irq_generic_chip_ops, NULL); if (!domain) { pr_err("Could not register interrupt domain.\n"); @@ -186,27 +193,28 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit; gc->chip_types[0].chip.irq_eoi = irq_gc_ack_set_bit; gc->chip_types[0].chip.irq_set_type = sunxi_sc_nmi_set_type; - gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED | + gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | + IRQCHIP_EOI_IF_HANDLED | IRQCHIP_SKIP_SET_WAKE; - gc->chip_types[0].regs.ack = reg_offs->pend; - gc->chip_types[0].regs.mask = reg_offs->enable; - gc->chip_types[0].regs.type = reg_offs->ctrl; + gc->chip_types[0].regs.ack = data->reg_offs.pend; + gc->chip_types[0].regs.mask = data->reg_offs.enable; + gc->chip_types[0].regs.type = data->reg_offs.ctrl; gc->chip_types[1].type = IRQ_TYPE_EDGE_BOTH; gc->chip_types[1].chip.irq_ack = irq_gc_ack_set_bit; gc->chip_types[1].chip.irq_mask = irq_gc_mask_clr_bit; gc->chip_types[1].chip.irq_unmask = irq_gc_mask_set_bit; gc->chip_types[1].chip.irq_set_type = sunxi_sc_nmi_set_type; - gc->chip_types[1].regs.ack = reg_offs->pend; - gc->chip_types[1].regs.mask = reg_offs->enable; - gc->chip_types[1].regs.type = reg_offs->ctrl; + gc->chip_types[1].regs.ack = data->reg_offs.pend; + gc->chip_types[1].regs.mask = data->reg_offs.enable; + gc->chip_types[1].regs.type = data->reg_offs.ctrl; gc->chip_types[1].handler = handle_edge_irq; /* Disable any active interrupts */ - sunxi_sc_nmi_write(gc, reg_offs->enable, 0); + sunxi_sc_nmi_write(gc, data->reg_offs.enable, data->enable_val); /* Clear any pending NMI interrupts */ - sunxi_sc_nmi_write(gc, reg_offs->pend, SUNXI_NMI_IRQ_BIT); + sunxi_sc_nmi_write(gc, data->reg_offs.pend, SUNXI_NMI_IRQ_BIT); irq_set_chained_handler_and_data(irq, sunxi_sc_nmi_handle_irq, domain); @@ -221,20 +229,27 @@ fail_irqd_remove: static int __init sun6i_sc_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return sunxi_sc_nmi_irq_init(node, &sun6i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun6i_data); } IRQCHIP_DECLARE(sun6i_sc_nmi, "allwinner,sun6i-a31-sc-nmi", sun6i_sc_nmi_irq_init); static int __init sun7i_sc_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return 
sunxi_sc_nmi_irq_init(node, &sun7i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun7i_data); } IRQCHIP_DECLARE(sun7i_sc_nmi, "allwinner,sun7i-a20-sc-nmi", sun7i_sc_nmi_irq_init); static int __init sun9i_nmi_irq_init(struct device_node *node, struct device_node *parent) { - return sunxi_sc_nmi_irq_init(node, &sun9i_reg_offs); + return sunxi_sc_nmi_irq_init(node, &sun9i_data); } IRQCHIP_DECLARE(sun9i_nmi, "allwinner,sun9i-a80-nmi", sun9i_nmi_irq_init); + +static int __init sun55i_nmi_irq_init(struct device_node *node, + struct device_node *parent) +{ + return sunxi_sc_nmi_irq_init(node, &sun55i_a523_data); +} +IRQCHIP_DECLARE(sun55i_nmi, "allwinner,sun55i-a523-nmi", sun55i_nmi_irq_init); diff --git a/drivers/leds/trigger/ledtrig-pattern.c b/drivers/leds/trigger/ledtrig-pattern.c index aad48c2540fc..a594bd5e2233 100644 --- a/drivers/leds/trigger/ledtrig-pattern.c +++ b/drivers/leds/trigger/ledtrig-pattern.c @@ -483,8 +483,8 @@ static int pattern_trig_activate(struct led_classdev *led_cdev) data->led_cdev = led_cdev; led_set_trigger_data(led_cdev, data); timer_setup(&data->timer, pattern_trig_timer_function, 0); - hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->hrtimer.function = pattern_trig_hrtimer_function; + hrtimer_setup(&data->hrtimer, pattern_trig_hrtimer_function, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); led_cdev->activated = true; if (led_cdev->flags & LED_INIT_DEFAULT_TRIGGER) { diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index d3d26a2c9895..118beaf447aa 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -534,9 +534,7 @@ int mbox_controller_register(struct mbox_controller *mbox) return -EINVAL; } - hrtimer_init(&mbox->poll_hrt, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - mbox->poll_hrt.function = txdone_hrtimer; + hrtimer_setup(&mbox->poll_hrt, txdone_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); spin_lock_init(&mbox->poll_hrt_lock); } diff --git a/drivers/media/cec/core/cec-pin.c b/drivers/media/cec/core/cec-pin.c index a70451d99ebc..bebaa40e0eb5 100644 --- a/drivers/media/cec/core/cec-pin.c +++ b/drivers/media/cec/core/cec-pin.c @@ -1346,9 +1346,8 @@ struct cec_adapter *cec_pin_allocate_adapter(const struct cec_pin_ops *pin_ops, if (pin == NULL) return ERR_PTR(-ENOMEM); pin->ops = pin_ops; - hrtimer_init(&pin->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_set(&pin->work_pin_num_events, 0); - pin->timer.function = cec_pin_timer; + hrtimer_setup(&pin->timer, cec_pin_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); init_waitqueue_head(&pin->kthread_waitq); pin->tx_custom_low_usecs = CEC_TIM_CUSTOM_DEFAULT; pin->tx_custom_high_usecs = CEC_TIM_CUSTOM_DEFAULT; diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index a04a1d33fadb..b9f2c14d62b4 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -190,8 +190,7 @@ static int __cx88_ir_start(void *priv) ir = core->ir; if (ir->polling) { - hrtimer_init(&ir->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ir->timer.function = cx88_ir_work; + hrtimer_setup(&ir->timer, cx88_ir_work, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(&ir->timer, ktime_set(0, ir->polling * 1000000), HRTIMER_MODE_REL); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c index d1320298a0f7..8479dc9c9a8f 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c @@ -269,8 +269,8 @@ static int 
wave5_vpu_probe(struct platform_device *pdev) dev->irq = platform_get_irq(pdev, 0); if (dev->irq < 0) { dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n"); - hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - dev->hrtimer.function = &wave5_vpu_timer_callback; + hrtimer_setup(&dev->hrtimer, &wave5_vpu_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); dev->worker = kthread_run_worker(0, "vpu_irq_thread"); if (IS_ERR(dev->worker)) { dev_err(&pdev->dev, "failed to create vpu irq worker\n"); diff --git a/drivers/media/rc/pwm-ir-tx.c b/drivers/media/rc/pwm-ir-tx.c index fe368aebbc13..84533fdd61aa 100644 --- a/drivers/media/rc/pwm-ir-tx.c +++ b/drivers/media/rc/pwm-ir-tx.c @@ -172,8 +172,7 @@ static int pwm_ir_probe(struct platform_device *pdev) rcdev->tx_ir = pwm_ir_tx_sleep; } else { init_completion(&pwm_ir->tx_done); - hrtimer_init(&pwm_ir->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pwm_ir->timer.function = pwm_ir_timer; + hrtimer_setup(&pwm_ir->timer, pwm_ir_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rcdev->tx_ir = pwm_ir_tx_atomic; } diff --git a/drivers/memory/omap-gpmc.c b/drivers/memory/omap-gpmc.c index e2a75a52563f..53f1888cc84f 100644 --- a/drivers/memory/omap-gpmc.c +++ b/drivers/memory/omap-gpmc.c @@ -2226,26 +2226,6 @@ static int gpmc_probe_generic_child(struct platform_device *pdev, goto err; } - if (of_node_name_eq(child, "nand")) { - /* Warn about older DT blobs with no compatible property */ - if (!of_property_read_bool(child, "compatible")) { - dev_warn(&pdev->dev, - "Incompatible NAND node: missing compatible"); - ret = -EINVAL; - goto err; - } - } - - if (of_node_name_eq(child, "onenand")) { - /* Warn about older DT blobs with no compatible property */ - if (!of_property_read_bool(child, "compatible")) { - dev_warn(&pdev->dev, - "Incompatible OneNAND node: missing compatible"); - ret = -EINVAL; - goto err; - } - } - if (of_match_node(omap_nand_ids, child)) { /* NAND specific setup */ val = 8; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 7e79da9684ed..185c08eab4ca 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2834,10 +2834,10 @@ struct rep_manu_reply{ u8 sas_format:1; u8 reserved1:7; u8 reserved2[3]; - u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN]; - u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN]; - u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN]; - u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN]; + u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN] __nonstring; + u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN] __nonstring; + u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN] __nonstring; + u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN] __nonstring; u16 component_id; u8 component_revision_id; u8 reserved3; diff --git a/drivers/misc/mei/Kconfig b/drivers/misc/mei/Kconfig index 67d9391f1855..7575fee96cc6 100644 --- a/drivers/misc/mei/Kconfig +++ b/drivers/misc/mei/Kconfig @@ -3,7 +3,7 @@ config INTEL_MEI tristate "Intel Management Engine Interface" depends on X86 && PCI - default GENERIC_CPU || MCORE2 || MATOM || X86_GENERIC + default X86_64 || MATOM help The Intel Management Engine (Intel ME) provides Manageability, Security and Media services for system containing Intel chipsets. 
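
Editor's note: nearly every hrtimer hunk in this diff, from the LED pattern trigger through the media, CAN, Ethernet and wireless drivers below, applies the same mechanical conversion: the two-step hrtimer_init() call followed by a hand-written assignment of timer->function is folded into a single hrtimer_setup() call that takes the callback up front, so the timer is never visible in a half-initialised state. A minimal sketch of the pattern follows; the foo_dev structure, the callback name and the 10 ms period are hypothetical illustrations, not taken from any patch in this series.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct foo_dev {
	struct hrtimer poll_timer;	/* hypothetical polling timer */
};

static enum hrtimer_restart foo_poll_timer_cb(struct hrtimer *t)
{
	/* ... poll the (hypothetical) hardware here ... */
	hrtimer_forward_now(t, ms_to_ktime(10));
	return HRTIMER_RESTART;
}

static void foo_start_polling(struct foo_dev *foo)
{
	/*
	 * Old style, as removed by the '-' lines in the hunks around
	 * this note:
	 *
	 *   hrtimer_init(&foo->poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   foo->poll_timer.function = foo_poll_timer_cb;
	 *
	 * New style: initialise and install the callback in one call.
	 */
	hrtimer_setup(&foo->poll_timer, foo_poll_timer_cb, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL);
	hrtimer_start(&foo->poll_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}

Where only the callback needs to change on an already-initialised timer, the rt2800mmio/rt2800usb hunks further down use hrtimer_update_function() instead of writing timer->function directly.
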
diff --git a/drivers/misc/vcpu_stall_detector.c b/drivers/misc/vcpu_stall_detector.c index f0b1fc87490e..26166357b255 100644 --- a/drivers/misc/vcpu_stall_detector.c +++ b/drivers/misc/vcpu_stall_detector.c @@ -111,8 +111,7 @@ static int start_stall_detector_cpu(unsigned int cpu) ping_timeout_ms = vcpu_stall_config.stall_timeout_sec * MSEC_PER_SEC / 2; - hrtimer_init(vcpu_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vcpu_hrtimer->function = vcpu_stall_detect_timer_fn; + hrtimer_setup(vcpu_hrtimer, vcpu_stall_detect_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu_stall_detector->is_initialized = true; hrtimer_start(vcpu_hrtimer, ms_to_ktime(ping_timeout_ms), diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index fc360902729d..24fffc702a94 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -2499,8 +2499,10 @@ static int atmci_probe(struct platform_device *pdev) /* Get MCI capabilities and set operations according to it */ atmci_get_cap(host); ret = atmci_configure_dma(host); - if (ret == -EPROBE_DEFER) + if (ret == -EPROBE_DEFER) { + clk_disable_unprepare(host->mck); goto err_dma_probe_defer; + } if (ret == 0) { host->prepare_data = &atmci_prepare_data_dma; host->submit_data = &atmci_submit_data_dma; diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 3cbda98d08d2..31f40c04afda 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1875,8 +1875,7 @@ static void dw_mci_init_fault(struct dw_mci *host) { host->fail_data_crc = (struct fault_attr) FAULT_ATTR_INITIALIZER; - hrtimer_init(&host->fault_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - host->fault_timer.function = dw_mci_fault_timer; + hrtimer_setup(&host->fault_timer, dw_mci_fault_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } #else static void dw_mci_init_fault(struct dw_mci *host) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index 0ef4d578ade8..48cdcba0f39c 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -503,8 +503,15 @@ static int sdhci_brcmstb_suspend(struct device *dev) struct sdhci_host *host = dev_get_drvdata(dev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); struct sdhci_brcmstb_priv *priv = sdhci_pltfm_priv(pltfm_host); + int ret; clk_disable_unprepare(priv->base_clk); + if (host->mmc->caps2 & MMC_CAP2_CQE) { + ret = cqhci_suspend(host->mmc); + if (ret) + return ret; + } + return sdhci_pltfm_suspend(dev); } @@ -529,6 +536,9 @@ static int sdhci_brcmstb_resume(struct device *dev) ret = clk_set_rate(priv->base_clk, priv->base_freq_hz); } + if (host->mmc->caps2 & MMC_CAP2_CQE) + ret = cqhci_resume(host->mmc); + return ret; } #endif diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index ac1a860986df..b080740bcb10 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -2260,14 +2260,19 @@ static int __maybe_unused flexcan_suspend(struct device *device) flexcan_chip_interrupts_disable(dev); + err = flexcan_transceiver_disable(priv); + if (err) + return err; + err = pinctrl_pm_select_sleep_state(device); if (err) return err; } netif_stop_queue(dev); netif_device_detach(dev); + + priv->can.state = CAN_STATE_SLEEPING; } - priv->can.state = CAN_STATE_SLEEPING; return 0; } @@ -2278,7 +2283,6 @@ static int __maybe_unused flexcan_resume(struct device *device) struct flexcan_priv *priv = netdev_priv(dev); int err; - priv->can.state = CAN_STATE_ERROR_ACTIVE; if 
(netif_running(dev)) { netif_device_attach(dev); netif_start_queue(dev); @@ -2292,12 +2296,20 @@ static int __maybe_unused flexcan_resume(struct device *device) if (err) return err; - err = flexcan_chip_start(dev); + err = flexcan_transceiver_enable(priv); if (err) return err; + err = flexcan_chip_start(dev); + if (err) { + flexcan_transceiver_disable(priv); + return err; + } + flexcan_chip_interrupts_enable(dev); } + + priv->can.state = CAN_STATE_ERROR_ACTIVE; } return 0; diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index d025d4163fd1..884a6352c42b 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2420,12 +2420,11 @@ int m_can_class_register(struct m_can_classdev *cdev) if (!cdev->net->irq) { dev_dbg(cdev->dev, "Polling enabled, initialize hrtimer"); - hrtimer_init(&cdev->hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - cdev->hrtimer.function = &hrtimer_callback; + hrtimer_setup(&cdev->hrtimer, &hrtimer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); } else { - hrtimer_init(&cdev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cdev->hrtimer.function = m_can_coalescing_timer; + hrtimer_setup(&cdev->hrtimer, m_can_coalescing_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ret = m_can_dev_setup(cdev); diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index df1a5d0b37b2..aa3df0d05b85 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -787,22 +787,14 @@ static void rcar_canfd_configure_controller(struct rcar_canfd_global *gpriv) } static void rcar_canfd_configure_afl_rules(struct rcar_canfd_global *gpriv, - u32 ch) + u32 ch, u32 rule_entry) { - u32 cfg; - int offset, start, page, num_rules = RCANFD_CHANNEL_NUMRULES; + int offset, page, num_rules = RCANFD_CHANNEL_NUMRULES; + u32 rule_entry_index = rule_entry % 16; u32 ridx = ch + RCANFD_RFFIFO_IDX; - if (ch == 0) { - start = 0; /* Channel 0 always starts from 0th rule */ - } else { - /* Get number of Channel 0 rules and adjust */ - cfg = rcar_canfd_read(gpriv->base, RCANFD_GAFLCFG(ch)); - start = RCANFD_GAFLCFG_GETRNC(gpriv, 0, cfg); - } - /* Enable write access to entry */ - page = RCANFD_GAFL_PAGENUM(start); + page = RCANFD_GAFL_PAGENUM(rule_entry); rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLECTR, (RCANFD_GAFLECTR_AFLPN(gpriv, page) | RCANFD_GAFLECTR_AFLDAE)); @@ -818,13 +810,13 @@ static void rcar_canfd_configure_afl_rules(struct rcar_canfd_global *gpriv, offset = RCANFD_C_GAFL_OFFSET; /* Accept all IDs */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLID(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLID(offset, rule_entry_index), 0); /* IDE or RTR is not considered for matching */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLM(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLM(offset, rule_entry_index), 0); /* Any data length accepted */ - rcar_canfd_write(gpriv->base, RCANFD_GAFLP0(offset, start), 0); + rcar_canfd_write(gpriv->base, RCANFD_GAFLP0(offset, rule_entry_index), 0); /* Place the msg in corresponding Rx FIFO entry */ - rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLP1(offset, start), + rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLP1(offset, rule_entry_index), RCANFD_GAFLP1_GAFLFDP(ridx)); /* Disable write access to page */ @@ -1851,6 +1843,7 @@ static int rcar_canfd_probe(struct platform_device *pdev) unsigned long channels_mask = 0; int err, ch_irq, g_irq; int g_err_irq, g_recc_irq; + u32 rule_entry = 0; bool fdmode = true; /* CAN FD only mode - default */ char 
name[9] = "channelX"; int i; @@ -2023,7 +2016,8 @@ static int rcar_canfd_probe(struct platform_device *pdev) rcar_canfd_configure_tx(gpriv, ch); /* Configure receive rules */ - rcar_canfd_configure_afl_rules(gpriv, ch); + rcar_canfd_configure_afl_rules(gpriv, ch, rule_entry); + rule_entry += RCANFD_CHANNEL_NUMRULES; } /* Configure common interrupts */ diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c index 7209a831f0f2..c34f2067a989 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-ring.c @@ -541,11 +541,11 @@ int mcp251xfd_ring_alloc(struct mcp251xfd_priv *priv) } priv->rx_ring_num = i; - hrtimer_init(&priv->rx_irq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->rx_irq_timer.function = mcp251xfd_rx_irq_timer; + hrtimer_setup(&priv->rx_irq_timer, mcp251xfd_rx_irq_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&priv->tx_irq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->tx_irq_timer.function = mcp251xfd_tx_irq_timer; + hrtimer_setup(&priv->tx_irq_timer, mcp251xfd_tx_irq_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); return 0; } diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index 39a63b7313a4..07406daf7c88 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -186,7 +186,7 @@ union ucan_ctl_payload { */ struct ucan_ctl_cmd_get_protocol_version cmd_get_protocol_version; - u8 raw[128]; + u8 fw_str[128]; } __packed; enum { @@ -424,18 +424,20 @@ static int ucan_ctrl_command_out(struct ucan_priv *up, UCAN_USB_CTL_PIPE_TIMEOUT); } -static int ucan_device_request_in(struct ucan_priv *up, - u8 cmd, u16 subcmd, u16 datalen) +static void ucan_get_fw_str(struct ucan_priv *up, char *fw_str, size_t size) { - return usb_control_msg(up->udev, - usb_rcvctrlpipe(up->udev, 0), - cmd, - USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, - subcmd, - 0, - up->ctl_msg_buffer, - datalen, - UCAN_USB_CTL_PIPE_TIMEOUT); + int ret; + + ret = usb_control_msg(up->udev, usb_rcvctrlpipe(up->udev, 0), + UCAN_DEVICE_GET_FW_STRING, + USB_DIR_IN | USB_TYPE_VENDOR | + USB_RECIP_DEVICE, + 0, 0, fw_str, size - 1, + UCAN_USB_CTL_PIPE_TIMEOUT); + if (ret > 0) + fw_str[ret] = '\0'; + else + strscpy(fw_str, "unknown", size); } /* Parse the device information structure reported by the device and @@ -1314,7 +1316,6 @@ static int ucan_probe(struct usb_interface *intf, u8 in_ep_addr; u8 out_ep_addr; union ucan_ctl_payload *ctl_msg_buffer; - char firmware_str[sizeof(union ucan_ctl_payload) + 1]; udev = interface_to_usbdev(intf); @@ -1527,17 +1528,6 @@ static int ucan_probe(struct usb_interface *intf, */ ucan_parse_device_info(up, &ctl_msg_buffer->cmd_get_device_info); - /* just print some device information - if available */ - ret = ucan_device_request_in(up, UCAN_DEVICE_GET_FW_STRING, 0, - sizeof(union ucan_ctl_payload)); - if (ret > 0) { - /* copy string while ensuring zero termination */ - strscpy(firmware_str, up->ctl_msg_buffer->raw, - sizeof(union ucan_ctl_payload) + 1); - } else { - strcpy(firmware_str, "unknown"); - } - /* device is compatible, reset it */ ret = ucan_ctrl_command_out(up, UCAN_COMMAND_RESET, 0, 0); if (ret < 0) @@ -1555,7 +1545,10 @@ static int ucan_probe(struct usb_interface *intf, /* initialisation complete, log device info */ netdev_info(up->netdev, "registered device\n"); - netdev_info(up->netdev, "firmware string: %s\n", firmware_str); + ucan_get_fw_str(up, up->ctl_msg_buffer->fw_str, + sizeof(up->ctl_msg_buffer->fw_str)); + 
netdev_info(up->netdev, "firmware string: %s\n", + up->ctl_msg_buffer->fw_str); /* success */ return 0; diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 991e3839858b..2b4bb74f21bf 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1833,9 +1833,8 @@ static int gmac_open(struct net_device *netdev) gmac_enable_tx_rx(netdev); netif_tx_start_all_queues(netdev); - hrtimer_init(&port->rx_coalesce_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - port->rx_coalesce_timer.function = &gmac_coalesce_delay_expired; + hrtimer_setup(&port->rx_coalesce_timer, &gmac_coalesce_delay_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); netdev_dbg(netdev, "opened\n"); diff --git a/drivers/net/ethernet/ec_bhf.c b/drivers/net/ethernet/ec_bhf.c index 44af1d13d931..67275aa4f65b 100644 --- a/drivers/net/ethernet/ec_bhf.c +++ b/drivers/net/ethernet/ec_bhf.c @@ -416,8 +416,7 @@ static int ec_bhf_open(struct net_device *net_dev) netif_start_queue(net_dev); - hrtimer_init(&priv->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - priv->hrtimer.function = ec_bhf_timer_fun; + hrtimer_setup(&priv->hrtimer, ec_bhf_timer_fun, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer_start(&priv->hrtimer, polling_frequency, HRTIMER_MODE_REL); return 0; diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index 7f6b57432071..fe4e7f99b6a3 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -739,8 +739,8 @@ void fec_ptp_init(struct platform_device *pdev, int irq_idx) INIT_DELAYED_WORK(&fep->time_keep, fec_time_keep); - hrtimer_init(&fep->perout_timer, CLOCK_REALTIME, HRTIMER_MODE_REL); - fep->perout_timer.function = fec_ptp_pps_perout_handler; + hrtimer_setup(&fep->perout_timer, fec_ptp_pps_perout_handler, CLOCK_REALTIME, + HRTIMER_MODE_REL); irq = platform_get_irq_byname_optional(pdev, "pps"); if (irq < 0) diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c index a376d4bdf281..18376bcc718a 100644 --- a/drivers/net/ethernet/hisilicon/hip04_eth.c +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -934,8 +934,6 @@ static int hip04_mac_probe(struct platform_device *pdev) priv->chan = arg.args[1] * RX_DESC_NUM; priv->group = arg.args[2]; - hrtimer_init(&priv->tx_coalesce_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - /* BQL will try to keep the TX queue as short as possible, but it can't * be faster than tx_coalesce_usecs, so we need a fast timeout here, * but also long enough to gather up enough frames to ensure we don't @@ -944,7 +942,7 @@ static int hip04_mac_probe(struct platform_device *pdev) */ priv->tx_coalesce_frames = TX_DESC_NUM * 3 / 4; priv->tx_coalesce_usecs = 200; - priv->tx_coalesce_timer.function = tx_done; + hrtimer_setup(&priv->tx_coalesce_timer, tx_done, CLOCK_MONOTONIC, HRTIMER_MODE_REL); priv->map = syscon_node_to_regmap(arg.np); of_node_put(arg.np); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 0676fc547b6f..f9ba79c1165b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -234,11 +234,17 @@ static int ibmvnic_set_queue_affinity(struct ibmvnic_sub_crq_queue *queue, (*stragglers)--; } /* atomic write is safer than writing bit by bit directly */ - for (i = 0; i < stride; i++) { - cpumask_set_cpu(*cpu, mask); - *cpu = cpumask_next_wrap(*cpu, cpu_online_mask, - nr_cpu_ids, false); + for_each_online_cpu_wrap(i, *cpu) { + if 
(!stride--) { + /* For the next queue we start from the first + * unused CPU in this queue + */ + *cpu = i; + break; + } + cpumask_set_cpu(i, mask); } + /* set queue affinity mask */ cpumask_copy(queue->affinity_mask, mask); rc = irq_set_affinity_and_hint(queue->irq, queue->affinity_mask); @@ -256,7 +262,7 @@ static void ibmvnic_set_affinity(struct ibmvnic_adapter *adapter) int num_rxqs = adapter->num_active_rx_scrqs, i_rxqs = 0; int num_txqs = adapter->num_active_tx_scrqs, i_txqs = 0; int total_queues, stride, stragglers, i; - unsigned int num_cpu, cpu; + unsigned int num_cpu, cpu = 0; bool is_rx_queue; int rc = 0; @@ -274,8 +280,6 @@ static void ibmvnic_set_affinity(struct ibmvnic_adapter *adapter) stride = max_t(int, num_cpu / total_queues, 1); /* number of leftover cpu's */ stragglers = num_cpu >= total_queues ? num_cpu % total_queues : 0; - /* next available cpu to assign irq to */ - cpu = cpumask_next(-1, cpu_online_mask); for (i = 0; i < total_queues; i++) { is_rx_queue = false; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 84307bb7313e..733820a0c350 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7090,8 +7090,8 @@ static int igc_probe(struct pci_dev *pdev, INIT_WORK(&adapter->reset_task, igc_reset_task); INIT_WORK(&adapter->watchdog_task, igc_watchdog_task); - hrtimer_init(&adapter->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - adapter->hrtimer.function = &igc_qbv_scheduling_timer; + hrtimer_setup(&adapter->hrtimer, &igc_qbv_scheduling_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Initialize link properties that are user-changeable */ adapter->fc_autoneg = true; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index dd76c1b7ed3a..3c7b43712d25 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6985,9 +6985,8 @@ static int mvpp2_port_probe(struct platform_device *pdev, for (thread = 0; thread < priv->nthreads; thread++) { port_pcpu = per_cpu_ptr(port->pcpu, thread); - hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb; + hrtimer_setup(&port_pcpu->tx_done_timer, mvpp2_hr_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); port_pcpu->timer_scheduled = false; port_pcpu->dev = dev; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c index bcc96eed2481..66749b3649c1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/ptp.c @@ -545,8 +545,7 @@ static int ptp_probe(struct pci_dev *pdev, spin_lock_init(&ptp->ptp_lock); if (cn10k_ptp_errata(ptp)) { ptp->read_ptp_tstmp = &read_ptp_tstmp_sec_nsec; - hrtimer_init(&ptp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ptp->hrtimer.function = ptp_reset_thresh; + hrtimer_setup(&ptp->hrtimer, ptp_reset_thresh, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } else { ptp->read_ptp_tstmp = &read_ptp_tstmp_nsec; } diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c index 138ac58fae51..f713656f1fae 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c @@ -375,6 +375,6 @@ irqreturn_t sparx5_xtr_handler(int irq, void *_sparx5) void 
sparx5_port_inj_timer_setup(struct sparx5_port *port) { - hrtimer_init(&port->inj_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->inj_timer.function = sparx5_injection_timeout; + hrtimer_setup(&port->inj_timer, sparx5_injection_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 11457b6296cc..638ef64d639f 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -134,9 +134,10 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) struct gdma_list_devices_resp resp = {}; struct gdma_general_req req = {}; struct gdma_dev_id dev; - u32 i, max_num_devs; + int found_dev = 0; u16 dev_type; int err; + u32 i; mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), sizeof(resp)); @@ -148,12 +149,17 @@ static int mana_gd_detect_devices(struct pci_dev *pdev) return err ? err : -EPROTO; } - max_num_devs = min_t(u32, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); - - for (i = 0; i < max_num_devs; i++) { + for (i = 0; i < GDMA_DEV_LIST_SIZE && + found_dev < resp.num_of_devs; i++) { dev = resp.devs[i]; dev_type = dev.type; + /* Skip empty devices */ + if (dev.as_uint32 == 0) + continue; + + found_dev++; + /* HWC is already detected in mana_hwc_create_channel(). */ if (dev_type == GDMA_DEVICE_HWC) continue; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index a5e3d1a88305..8b4640c5d61e 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -686,8 +686,8 @@ void rmnet_map_update_ul_agg_config(struct rmnet_port *port, u32 size, void rmnet_map_tx_aggregate_init(struct rmnet_port *port) { - hrtimer_init(&port->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->hrtimer.function = rmnet_map_flush_tx_packet_queue; + hrtimer_setup(&port->hrtimer, rmnet_map_flush_tx_packet_queue, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); spin_lock_init(&port->agg_lock); rmnet_map_update_ul_agg_config(port, 4096, 1, 800); INIT_WORK(&port->agg_wq, rmnet_map_flush_tx_packet_work); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index bd4eb187f8c6..b5a7e05ab7a7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -46,7 +46,9 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, u32 a_index = 0; if (!plat_dat->axi) { - plat_dat->axi = kzalloc(sizeof(struct stmmac_axi), GFP_KERNEL); + plat_dat->axi = devm_kzalloc(&pdev->dev, + sizeof(struct stmmac_axi), + GFP_KERNEL); if (!plat_dat->axi) return -ENOMEM; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index c0ae7db96f46..554d2c0a8fde 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3199,8 +3199,7 @@ static void stmmac_init_coalesce(struct stmmac_priv *priv) priv->tx_coal_frames[chan] = STMMAC_TX_FRAMES; priv->tx_coal_timer[chan] = STMMAC_COAL_TX_TIMER; - hrtimer_init(&tx_q->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx_q->txtimer.function = stmmac_tx_timer; + hrtimer_setup(&tx_q->txtimer, stmmac_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } for (chan = 0; chan < rx_channel_count; chan++) @@ -6970,8 +6969,7 @@ int stmmac_xdp_open(struct net_device 
*dev) stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, chan); - hrtimer_init(&tx_q->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx_q->txtimer.function = stmmac_tx_timer; + hrtimer_setup(&tx_q->txtimer, stmmac_tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } /* Enable the MAC Rx/Tx */ diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2806238629f8..4a8e6b9413e3 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2306,13 +2306,17 @@ static void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common) static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common) { struct device *dev = common->dev; + struct am65_cpsw_tx_chn *tx_chn; int i, ret = 0; for (i = 0; i < common->tx_ch_num; i++) { - struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; + tx_chn = &common->tx_chns[i]; + + hrtimer_setup(&tx_chn->tx_hrtimer, &am65_cpsw_nuss_tx_timer_callback, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - tx_chn->tx_hrtimer.function = &am65_cpsw_nuss_tx_timer_callback; + netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx, + am65_cpsw_nuss_tx_poll); ret = devm_request_irq(dev, tx_chn->irq, am65_cpsw_nuss_tx_irq, @@ -2323,19 +2327,16 @@ static int am65_cpsw_nuss_ndev_add_tx_napi(struct am65_cpsw_common *common) tx_chn->id, tx_chn->irq, ret); goto err; } - - netif_napi_add_tx(common->dma_ndev, &tx_chn->napi_tx, - am65_cpsw_nuss_tx_poll); } return 0; err: - for (--i ; i >= 0 ; i--) { - struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; - - netif_napi_del(&tx_chn->napi_tx); + netif_napi_del(&tx_chn->napi_tx); + for (--i; i >= 0; i--) { + tx_chn = &common->tx_chns[i]; devm_free_irq(dev, tx_chn->irq, tx_chn); + netif_napi_del(&tx_chn->napi_tx); } return ret; @@ -2565,9 +2566,11 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) snprintf(flow->name, sizeof(flow->name), "%s-rx%d", dev_name(dev), i); - hrtimer_init(&flow->rx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - flow->rx_hrtimer.function = &am65_cpsw_nuss_rx_timer_callback; + hrtimer_setup(&flow->rx_hrtimer, &am65_cpsw_nuss_rx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); + + netif_napi_add(common->dma_ndev, &flow->napi_rx, + am65_cpsw_nuss_rx_poll); ret = devm_request_irq(dev, flow->irq, am65_cpsw_nuss_rx_irq, @@ -2577,11 +2580,8 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) dev_err(dev, "failure requesting rx %d irq %u, %d\n", i, flow->irq, ret); flow->irq = -EINVAL; - goto err_flow; + goto err_request_irq; } - - netif_napi_add(common->dma_ndev, &flow->napi_rx, - am65_cpsw_nuss_rx_poll); } /* setup classifier to route priorities to flows */ @@ -2589,11 +2589,14 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) return 0; +err_request_irq: + netif_napi_del(&flow->napi_rx); + err_flow: - for (--i; i >= 0 ; i--) { + for (--i; i >= 0; i--) { flow = &rx_chn->flows[i]; - netif_napi_del(&flow->napi_rx); devm_free_irq(dev, flow->irq, flow); + netif_napi_del(&flow->napi_rx); } err: diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index 74f0f200a89d..6c1b8ff563e0 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -249,9 +249,8 @@ int prueth_ndev_add_tx_napi(struct prueth_emac *emac) struct prueth_tx_chn *tx_chn = 
&emac->tx_chns[i]; netif_napi_add_tx(emac->ndev, &tx_chn->napi_tx, emac_napi_tx_poll); - hrtimer_init(&tx_chn->tx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - tx_chn->tx_hrtimer.function = &emac_tx_timer_callback; + hrtimer_setup(&tx_chn->tx_hrtimer, &emac_tx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); ret = request_irq(tx_chn->irq, prueth_tx_irq, IRQF_TRIGGER_HIGH, tx_chn->name, tx_chn); diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 00ed97860547..4d496c8f479d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -1169,9 +1169,8 @@ static int prueth_netdev_init(struct prueth *prueth, ndev->hw_features |= NETIF_PRUETH_HSR_OFFLOAD_FEATURES; netif_napi_add(ndev, &emac->napi_rx, icssg_napi_rx_poll); - hrtimer_init(&emac->rx_hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - emac->rx_hrtimer.function = &emac_rx_timer_callback; + hrtimer_setup(&emac->rx_hrtimer, &emac_rx_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); prueth->emac[mac] = emac; return 0; @@ -1679,6 +1678,7 @@ static int prueth_probe(struct platform_device *pdev) } spin_lock_init(&prueth->vtbl_lock); + spin_lock_init(&prueth->stats_lock); /* setup netdev interfaces */ if (eth0_node) { ret = prueth_netdev_init(prueth, eth0_node); diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 329b46e9ee53..f41786b05741 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -305,6 +305,8 @@ struct prueth { int default_vlan; /** @vtbl_lock: Lock for vtbl in shared memory */ spinlock_t vtbl_lock; + /** @stats_lock: Lock for reading icssg stats */ + spinlock_t stats_lock; }; struct emac_tx_ts_response { diff --git a/drivers/net/ethernet/ti/icssg/icssg_stats.c b/drivers/net/ethernet/ti/icssg/icssg_stats.c index 8800bd3a8d07..6f0edae38ea2 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_stats.c +++ b/drivers/net/ethernet/ti/icssg/icssg_stats.c @@ -26,6 +26,8 @@ void emac_update_hardware_stats(struct prueth_emac *emac) u32 val, reg; int i; + spin_lock(&prueth->stats_lock); + for (i = 0; i < ARRAY_SIZE(icssg_all_miig_stats); i++) { regmap_read(prueth->miig_rt, base + icssg_all_miig_stats[i].offset, @@ -51,6 +53,8 @@ void emac_update_hardware_stats(struct prueth_emac *emac) emac->pa_stats[i] += val; } } + + spin_unlock(&prueth->stats_lock); } void icssg_stats_work_handler(struct work_struct *work) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index f632b0cfd5ae..fd91f8a45bce 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -776,8 +776,8 @@ at86rf230_setup_spi_messages(struct at86rf230_local *lp, state->trx.tx_buf = state->buf; state->trx.rx_buf = state->buf; spi_message_add_tail(&state->trx, &state->msg); - hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - state->timer.function = at86rf230_async_state_timer; + hrtimer_setup(&state->timer, at86rf230_async_state_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static irqreturn_t at86rf230_isr(int irq, void *data) diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c index 4a5d73002a1a..0e9e987f37dd 100644 --- a/drivers/net/phy/phy_link_topology.c +++ b/drivers/net/phy/phy_link_topology.c @@ -73,7 +73,7 @@ int phy_link_topo_add_phy(struct net_device *dev, xa_limit_32b, &topo->next_phy_index, 
GFP_KERNEL); - if (ret) + if (ret < 0) goto err; return 0; diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index d5c47a2a62dc..34e82f1e37d9 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -833,8 +833,7 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ ctx->dev = dev; - hrtimer_init(&ctx->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ctx->tx_timer.function = &cdc_ncm_tx_timer_cb; + hrtimer_setup(&ctx->tx_timer, &cdc_ncm_tx_timer_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL); tasklet_setup(&ctx->bh, cdc_ncm_txpath_bh); atomic_set(&ctx->stop, 0); spin_lock_init(&ctx->mtx); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7646ddd9bef7..9d7c37e968b5 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3826,7 +3826,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) cpumask_var_t mask; int stragglers; int group_size; - int i, j, cpu; + int i, start = 0, cpu; int num_cpu; int stride; @@ -3840,16 +3840,18 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); - for (j = 0; j < group_size; j++) { + for_each_online_cpu_wrap(cpu, start) { + if (!group_size--) { + start = cpu; + break; + } cpumask_set_cpu(cpu, mask); - cpu = cpumask_next_wrap(cpu, cpu_online_mask, - nr_cpu_ids, false); } + virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c index 0e1ede9314d8..4840d0b500b3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c @@ -264,8 +264,8 @@ void mt76x02u_init_beacon_config(struct mt76x02_dev *dev) }; dev->beacon_ops = &beacon_ops; - hrtimer_init(&dev->pre_tbtt_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - dev->pre_tbtt_timer.function = mt76x02u_pre_tbtt_interrupt; + hrtimer_setup(&dev->pre_tbtt_timer, mt76x02u_pre_tbtt_interrupt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); INIT_WORK(&dev->pre_tbtt_work, mt76x02u_pre_tbtt_work); mt76x02_init_beacon_config(dev); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c index 5323acff962a..45775ecdf221 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c @@ -842,7 +842,7 @@ int rt2800mmio_probe_hw(struct rt2x00_dev *rt2x00dev) /* * Set txstatus timer function. */ - rt2x00dev->txstatus_timer.function = rt2800mmio_tx_sta_fifo_timeout; + hrtimer_update_function(&rt2x00dev->txstatus_timer, rt2800mmio_tx_sta_fifo_timeout); /* * Overwrite TX done handler diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c index 160bef79acdb..b51a23300ba2 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c @@ -618,7 +618,7 @@ static int rt2800usb_probe_hw(struct rt2x00_dev *rt2x00dev) /* * Set txstatus timer function. 
*/ - rt2x00dev->txstatus_timer.function = rt2800usb_tx_sta_fifo_timeout; + hrtimer_update_function(&rt2x00dev->txstatus_timer, rt2800usb_tx_sta_fifo_timeout); /* * Overwrite TX done handler diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 9e7d9dbe954c..432ddfac2c33 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -1391,8 +1391,8 @@ int rt2x00lib_probe_dev(struct rt2x00_dev *rt2x00dev) mutex_init(&rt2x00dev->conf_mutex); INIT_LIST_HEAD(&rt2x00dev->bar_list); spin_lock_init(&rt2x00dev->bar_list_lock); - hrtimer_init(&rt2x00dev->txstatus_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + hrtimer_setup(&rt2x00dev->txstatus_timer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); set_bit(DEVICE_STATE_PRESENT, &rt2x00dev->flags); diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index cf6a331d4042..fb187a9e984e 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -5548,10 +5548,8 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, wiphy_ext_feature_set(hw->wiphy, NL80211_EXT_FEATURE_CQM_RSSI_LIST); for (i = 0; i < ARRAY_SIZE(data->link_data); i++) { - hrtimer_init(&data->link_data[i].beacon_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_SOFT); - data->link_data[i].beacon_timer.function = - mac80211_hwsim_beacon; + hrtimer_setup(&data->link_data[i].beacon_timer, mac80211_hwsim_beacon, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS_SOFT); data->link_data[i].link_id = i; } diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem.c b/drivers/net/wwan/iosm/iosm_ipc_imem.c index 829515a601b3..530a3ea47a1a 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_imem.c +++ b/drivers/net/wwan/iosm/iosm_ipc_imem.c @@ -1381,24 +1381,20 @@ struct iosm_imem *ipc_imem_init(struct iosm_pcie *pcie, unsigned int device_id, /* The phase is set to power off. 
*/ ipc_imem->phase = IPC_P_OFF; - hrtimer_init(&ipc_imem->startup_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->startup_timer.function = ipc_imem_startup_timer_cb; + hrtimer_setup(&ipc_imem->startup_timer, ipc_imem_startup_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->tdupdate_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->tdupdate_timer.function = ipc_imem_td_update_timer_cb; + hrtimer_setup(&ipc_imem->tdupdate_timer, ipc_imem_td_update_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->fast_update_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->fast_update_timer.function = ipc_imem_fast_update_timer_cb; + hrtimer_setup(&ipc_imem->fast_update_timer, ipc_imem_fast_update_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->td_alloc_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ipc_imem->td_alloc_timer.function = ipc_imem_td_alloc_timer_cb; + hrtimer_setup(&ipc_imem->td_alloc_timer, ipc_imem_td_alloc_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&ipc_imem->adb_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ipc_imem->adb_timer.function = ipc_imem_adb_timer_cb; + hrtimer_setup(&ipc_imem->adb_timer, ipc_imem_adb_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); if (ipc_imem_config(ipc_imem)) { dev_err(ipc_imem->dev, "failed to initialize the imem"); diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c index 6295e55ef85e..368f6d894bba 100644 --- a/drivers/ntb/msi.c +++ b/drivers/ntb/msi.c @@ -106,10 +106,10 @@ int ntb_msi_setup_mws(struct ntb_dev *ntb) if (!ntb->msi) return -EINVAL; - msi_lock_descs(&ntb->pdev->dev); - desc = msi_first_desc(&ntb->pdev->dev, MSI_DESC_ASSOCIATED); - addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32); - msi_unlock_descs(&ntb->pdev->dev); + scoped_guard (msi_descs_lock, &ntb->pdev->dev) { + desc = msi_first_desc(&ntb->pdev->dev, MSI_DESC_ASSOCIATED); + addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32); + } for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) { peer_widx = ntb_peer_highest_mw_idx(ntb, peer); @@ -289,7 +289,7 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, if (!ntb->msi) return -EINVAL; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_for_each_desc(entry, dev, MSI_DESC_ASSOCIATED) { if (irq_has_action(entry->irq)) continue; @@ -307,17 +307,11 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, ret = ntbm_msi_setup_callback(ntb, entry, msi_desc); if (ret) { devm_free_irq(&ntb->dev, entry->irq, dev_id); - goto unlock; + return ret; } - - ret = entry->irq; - goto unlock; + return entry->irq; } - ret = -ENODEV; - -unlock: - msi_unlock_descs(dev); - return ret; + return -ENODEV; } EXPORT_SYMBOL(ntbm_msi_request_threaded_irq); diff --git a/drivers/ntb/test/ntb_pingpong.c b/drivers/ntb/test/ntb_pingpong.c index 8aeca7914050..1c1c74f4ff2d 100644 --- a/drivers/ntb/test/ntb_pingpong.c +++ b/drivers/ntb/test/ntb_pingpong.c @@ -284,8 +284,7 @@ static struct pp_ctx *pp_create_data(struct ntb_dev *ntb) pp->ntb = ntb; atomic_set(&pp->count, 0); spin_lock_init(&pp->lock); - hrtimer_init(&pp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - pp->timer.function = pp_timer_func; + hrtimer_setup(&pp->timer, pp_timer_func, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return pp; } diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2fbd379923fd..5c3054aaec8c 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -203,6 +203,12 @@ config PCI_P2PDMA 
P2P DMA transactions must be between devices behind the same root port. + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. + If unsure, say N. config PCI_LABEL diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 6084b38bdda1..178da6b9fc33 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1757,8 +1757,7 @@ static int hv_compose_multi_msi_req_get_cpu(void) spin_lock_irqsave(&multi_msi_cpu_lock, flags); - cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids, - false); + cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask); cpu = cpu_next; spin_unlock_irqrestore(&multi_msi_cpu_lock, flags); @@ -3976,24 +3975,18 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) { struct irq_data *irq_data; struct msi_desc *entry; - int ret = 0; if (!pdev->msi_enabled && !pdev->msix_enabled) return 0; - msi_lock_descs(&pdev->dev); + guard(msi_descs_lock)(&pdev->dev); msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { irq_data = irq_get_irq_data(entry->irq); - if (WARN_ON_ONCE(!irq_data)) { - ret = -EINVAL; - break; - } - + if (WARN_ON_ONCE(!irq_data)) + return -EINVAL; hv_compose_msi_msg(irq_data, &entry->msg); } - msi_unlock_descs(&pdev->dev); - - return ret; + return 0; } /* diff --git a/drivers/pci/msi/api.c b/drivers/pci/msi/api.c index b956ce591f96..d89f491afdf0 100644 --- a/drivers/pci/msi/api.c +++ b/drivers/pci/msi/api.c @@ -53,10 +53,9 @@ void pci_disable_msi(struct pci_dev *dev) if (!pci_msi_enabled() || !dev || !dev->msi_enabled) return; - msi_lock_descs(&dev->dev); + guard(msi_descs_lock)(&dev->dev); pci_msi_shutdown(dev); pci_free_msi_irqs(dev); - msi_unlock_descs(&dev->dev); } EXPORT_SYMBOL(pci_disable_msi); @@ -196,10 +195,9 @@ void pci_disable_msix(struct pci_dev *dev) if (!pci_msi_enabled() || !dev || !dev->msix_enabled) return; - msi_lock_descs(&dev->dev); + guard(msi_descs_lock)(&dev->dev); pci_msix_shutdown(dev); pci_free_msi_irqs(dev); - msi_unlock_descs(&dev->dev); } EXPORT_SYMBOL(pci_disable_msix); diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 2f647cac4cae..dc78d9d402c3 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -336,41 +336,11 @@ static int msi_verify_entries(struct pci_dev *dev) return !entry ? 0 : -EIO; } -/** - * msi_capability_init - configure device's MSI capability structure - * @dev: pointer to the pci_dev data structure of MSI device function - * @nvec: number of interrupts to allocate - * @affd: description of automatic IRQ affinity assignments (may be %NULL) - * - * Setup the MSI capability structure of the device with the requested - * number of interrupts. A return value of zero indicates the successful - * setup of an entry with the new MSI IRQ. A negative return value indicates - * an error, and a positive return value indicates the number of interrupts - * which could have been allocated. 
- */ -static int msi_capability_init(struct pci_dev *dev, int nvec, - struct irq_affinity *affd) +static int __msi_capability_init(struct pci_dev *dev, int nvec, struct irq_affinity_desc *masks) { - struct irq_affinity_desc *masks = NULL; + int ret = msi_setup_msi_desc(dev, nvec, masks); struct msi_desc *entry, desc; - int ret; - /* Reject multi-MSI early on irq domain enabled architectures */ - if (nvec > 1 && !pci_msi_domain_supports(dev, MSI_FLAG_MULTI_PCI_MSI, ALLOW_LEGACY)) - return 1; - - /* - * Disable MSI during setup in the hardware, but mark it enabled - * so that setup code can evaluate it. - */ - pci_msi_set_enable(dev, 0); - dev->msi_enabled = 1; - - if (affd) - masks = irq_create_affinity_masks(nvec, affd); - - msi_lock_descs(&dev->dev); - ret = msi_setup_msi_desc(dev, nvec, masks); if (ret) goto fail; @@ -399,19 +369,48 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, pcibios_free_irq(dev); dev->irq = entry->irq; - goto unlock; - + return 0; err: pci_msi_unmask(&desc, msi_multi_mask(&desc)); pci_free_msi_irqs(dev); fail: dev->msi_enabled = 0; -unlock: - msi_unlock_descs(&dev->dev); - kfree(masks); return ret; } +/** + * msi_capability_init - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * @nvec: number of interrupts to allocate + * @affd: description of automatic IRQ affinity assignments (may be %NULL) + * + * Setup the MSI capability structure of the device with the requested + * number of interrupts. A return value of zero indicates the successful + * setup of an entry with the new MSI IRQ. A negative return value indicates + * an error, and a positive return value indicates the number of interrupts + * which could have been allocated. + */ +static int msi_capability_init(struct pci_dev *dev, int nvec, + struct irq_affinity *affd) +{ + /* Reject multi-MSI early on irq domain enabled architectures */ + if (nvec > 1 && !pci_msi_domain_supports(dev, MSI_FLAG_MULTI_PCI_MSI, ALLOW_LEGACY)) + return 1; + + /* + * Disable MSI during setup in the hardware, but mark it enabled + * so that setup code can evaluate it. + */ + pci_msi_set_enable(dev, 0); + dev->msi_enabled = 1; + + struct irq_affinity_desc *masks __free(kfree) = + affd ? 
irq_create_affinity_masks(nvec, affd) : NULL; + + guard(msi_descs_lock)(&dev->dev); + return __msi_capability_init(dev, nvec, masks); +} + int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, struct irq_affinity *affd) { @@ -666,40 +665,41 @@ static void msix_mask_all(void __iomem *base, int tsize) writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); } -static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) +static int __msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity_desc *masks) { - struct irq_affinity_desc *masks = NULL; - int ret; - - if (affd) - masks = irq_create_affinity_masks(nvec, affd); + int ret = msix_setup_msi_descs(dev, entries, nvec, masks); - msi_lock_descs(&dev->dev); - ret = msix_setup_msi_descs(dev, entries, nvec, masks); if (ret) - goto out_free; + goto fail; ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) - goto out_free; + goto fail; /* Check if all MSI entries honor device restrictions */ ret = msi_verify_entries(dev); if (ret) - goto out_free; + goto fail; msix_update_entries(dev, entries); - goto out_unlock; + return 0; -out_free: +fail: pci_free_msi_irqs(dev); -out_unlock: - msi_unlock_descs(&dev->dev); - kfree(masks); return ret; } +static int msix_setup_interrupts(struct pci_dev *dev, struct msix_entry *entries, + int nvec, struct irq_affinity *affd) +{ + struct irq_affinity_desc *masks __free(kfree) = + affd ? irq_create_affinity_masks(nvec, affd) : NULL; + + guard(msi_descs_lock)(&dev->dev); + return __msix_setup_interrupts(dev, entries, nvec, masks); +} + /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -871,13 +871,13 @@ void __pci_restore_msix_state(struct pci_dev *dev) write_msg = arch_restore_msi_irqs(dev); - msi_lock_descs(&dev->dev); - msi_for_each_desc(entry, &dev->dev, MSI_DESC_ALL) { - if (write_msg) - __pci_write_msi_msg(entry, &entry->msg); - pci_msix_write_vector_ctrl(entry, entry->pci.msix_ctrl); + scoped_guard (msi_descs_lock, &dev->dev) { + msi_for_each_desc(entry, &dev->dev, MSI_DESC_ALL) { + if (write_msg) + __pci_write_msi_msg(entry, &entry->msg); + pci_msix_write_vector_ctrl(entry, entry->pci.msix_ctrl); + } } - msi_unlock_descs(&dev->dev); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); } @@ -916,6 +916,53 @@ void pci_free_msi_irqs(struct pci_dev *dev) } } +#ifdef CONFIG_PCIE_TPH +/** + * pci_msix_write_tph_tag - Update the TPH tag for a given MSI-X vector + * @pdev: The PCIe device to update + * @index: The MSI-X index to update + * @tag: The tag to write + * + * Returns: 0 on success, error code on failure + */ +int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag) +{ + struct msi_desc *msi_desc; + struct irq_desc *irq_desc; + unsigned int virq; + + if (!pdev->msix_enabled) + return -ENXIO; + + guard(msi_descs_lock)(&pdev->dev); + virq = msi_get_virq(&pdev->dev, index); + if (!virq) + return -ENXIO; + /* + * This is a horrible hack, but short of implementing a PCI + * specific interrupt chip callback and a huge pile of + * infrastructure, this is the minor nuissance. It provides the + * protection against concurrent operations on this entry and keeps + * the control word cache in sync. 
+ */ + irq_desc = irq_to_desc(virq); + if (!irq_desc) + return -ENXIO; + + guard(raw_spinlock_irq)(&irq_desc->lock); + msi_desc = irq_data_get_msi_desc(&irq_desc->irq_data); + if (!msi_desc || msi_desc->pci.msi_attrib.is_virtual) + return -ENXIO; + + msi_desc->pci.msix_ctrl &= ~PCI_MSIX_ENTRY_CTRL_ST; + msi_desc->pci.msix_ctrl |= FIELD_PREP(PCI_MSIX_ENTRY_CTRL_ST, tag); + pci_msix_write_vector_ctrl(msi_desc, msi_desc->pci.msix_ctrl); + /* Flush the write */ + readl(pci_msix_desc_addr(msi_desc)); + return 0; +} +#endif + /* Misc. infrastructure */ struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 01e51db8d285..2e9cf26a9ee9 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -989,6 +989,15 @@ int pcim_request_region_exclusive(struct pci_dev *pdev, int bar, const char *name); void pcim_release_region(struct pci_dev *pdev, int bar); +#ifdef CONFIG_PCI_MSI +int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag); +#else +static inline int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int index, u16 tag) +{ + return -ENODEV; +} +#endif + /* * Config Address for PCI Configuration Mechanism #1 * diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 07de59ca2ebf..77fce5e1b830 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -204,48 +204,6 @@ static u8 get_rp_completer_type(struct pci_dev *pdev) return FIELD_GET(PCI_EXP_DEVCAP2_TPH_COMP_MASK, reg); } -/* Write ST to MSI-X vector control reg - Return 0 if OK, otherwise -errno */ -static int write_tag_to_msix(struct pci_dev *pdev, int msix_idx, u16 tag) -{ -#ifdef CONFIG_PCI_MSI - struct msi_desc *msi_desc = NULL; - void __iomem *vec_ctrl; - u32 val; - int err = 0; - - msi_lock_descs(&pdev->dev); - - /* Find the msi_desc entry with matching msix_idx */ - msi_for_each_desc(msi_desc, &pdev->dev, MSI_DESC_ASSOCIATED) { - if (msi_desc->msi_index == msix_idx) - break; - } - - if (!msi_desc) { - err = -ENXIO; - goto err_out; - } - - /* Get the vector control register (offset 0xc) pointed by msix_idx */ - vec_ctrl = pdev->msix_base + msix_idx * PCI_MSIX_ENTRY_SIZE; - vec_ctrl += PCI_MSIX_ENTRY_VECTOR_CTRL; - - val = readl(vec_ctrl); - val &= ~PCI_MSIX_ENTRY_CTRL_ST; - val |= FIELD_PREP(PCI_MSIX_ENTRY_CTRL_ST, tag); - writel(val, vec_ctrl); - - /* Read back to flush the update */ - val = readl(vec_ctrl); - -err_out: - msi_unlock_descs(&pdev->dev); - return err; -#else - return -ENODEV; -#endif -} - /* Write tag to ST table - Return 0 if OK, otherwise -errno */ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag) { @@ -346,7 +304,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) switch (loc) { case PCI_TPH_LOC_MSIX: - err = write_tag_to_msix(pdev, index, tag); + err = pci_msix_write_tph_tag(pdev, index, tag); break; case PCI_TPH_LOC_CAP: err = write_tag_to_st_table(pdev, index, tag); diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index 6be703619a97..df9a28ba69dc 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -428,10 +428,6 @@ static void m1_pmu_enable_event(struct perf_event *event) user = event->hw.config_base & M1_PMU_CFG_COUNT_USER; kernel = event->hw.config_base & M1_PMU_CFG_COUNT_KERNEL; - m1_pmu_disable_counter_interrupt(event->hw.idx); - m1_pmu_disable_counter(event->hw.idx); - isb(); - m1_pmu_configure_counter(event->hw.idx, event->hw.config_base); m1_pmu_enable_counter(event->hw.idx); 
m1_pmu_enable_counter_interrupt(event->hw.idx); diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index d5fcea3d4328..1a0d0e1a2263 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1273,9 +1273,8 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn) /* No overflow interrupt? Have to use a timer instead. */ if (!ccn->irq) { dev_info(ccn->dev, "No access to interrupts, using timer.\n"); - hrtimer_init(&ccn->dt.hrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - ccn->dt.hrtimer.function = arm_ccn_pmu_timer_handler; + hrtimer_setup(&ccn->dt.hrtimer, arm_ccn_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } /* Pick one CPU which we will use to collect data from CCN... */ diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index ef959e66db7c..d4fe30ff225b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -802,8 +802,6 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, CMN_EVENT_ATTR(_model, ccha_##_name, CMN_TYPE_CCHA, _event) #define CMN_EVENT_CCLA(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, ccla_##_name, CMN_TYPE_CCLA, _event) -#define CMN_EVENT_CCLA_RNI(_name, _event) \ - CMN_EVENT_ATTR(CMN_ANY, ccla_rni_##_name, CMN_TYPE_CCLA_RNI, _event) #define CMN_EVENT_HNS(_name, _event) \ CMN_EVENT_ATTR(CMN_ANY, hns_##_name, CMN_TYPE_HNS, _event) @@ -1798,6 +1796,9 @@ static int arm_cmn_event_init(struct perf_event *event) } else if (type == CMN_TYPE_XP && (cmn->part == PART_CMN700 || cmn->part == PART_CMN_S3)) { hw->wide_sel = true; + } else if (type == CMN_TYPE_RND) { + /* Secretly permit this as an alias for "rnid" events */ + type = CMN_TYPE_RNI; } /* This is sufficiently annoying to recalculate, so cache it */ diff --git a/drivers/perf/arm_cspmu/ampere_cspmu.c b/drivers/perf/arm_cspmu/ampere_cspmu.c index f72f5689923c..b8ca69fd9d1d 100644 --- a/drivers/perf/arm_cspmu/ampere_cspmu.c +++ b/drivers/perf/arm_cspmu/ampere_cspmu.c @@ -10,10 +10,10 @@ #include "arm_cspmu.h" -#define PMAUXR0 0xD80 -#define PMAUXR1 0xD84 -#define PMAUXR2 0xD88 -#define PMAUXR3 0xD8C +#define PMAUXR0 PMIMPDEF +#define PMAUXR1 (PMIMPDEF + 0x4) +#define PMAUXR2 (PMIMPDEF + 0x8) +#define PMAUXR3 (PMIMPDEF + 0xC) #define to_ampere_cspmu_ctx(cspmu) ((struct ampere_cspmu_ctx *)(cspmu->impl.ctx)) @@ -132,32 +132,20 @@ ampere_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } -static u32 ampere_cspmu_event_filter(const struct perf_event *event) +static void ampere_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) { /* - * PMEVFILTR or PMCCFILTR aren't used in Ampere SoC PMU but are marked - * as RES0. Make sure, PMCCFILTR is written zero. + * PMCCFILTR is RES0, so this is just a dummy callback to override + * the default implementation and avoid writing to it. */ - return 0; } static void ampere_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, - u32 filter) + const struct perf_event *event) { - struct perf_event *event; - unsigned int idx; u32 threshold, rank, bank; - /* - * At this point, all the events have the same filter settings. - * Therefore, take the first event and use its configuration. 
- */ - idx = find_first_bit(cspmu->hw_events.used_ctrs, - cspmu->cycle_counter_logical_idx); - - event = cspmu->hw_events.events[idx]; - threshold = get_threshold(event); rank = get_rank(event); bank = get_bank(event); @@ -233,7 +221,7 @@ static int ampere_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; - impl_ops->event_filter = ampere_cspmu_event_filter; + impl_ops->set_cc_filter = ampere_cspmu_set_cc_filter; impl_ops->set_ev_filter = ampere_cspmu_set_ev_filter; impl_ops->validate_event = ampere_cspmu_validate_event; impl_ops->get_name = ampere_cspmu_get_name; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 81e8b97e9353..efa9b229e701 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -40,51 +40,6 @@ ARM_CSPMU_EXT_ATTR(_name, arm_cspmu_cpumask_show, \ (unsigned long)_config) -/* - * CoreSight PMU Arch register offsets. - */ -#define PMEVCNTR_LO 0x0 -#define PMEVCNTR_HI 0x4 -#define PMEVTYPER 0x400 -#define PMCCFILTR 0x47C -#define PMEVFILTR 0xA00 -#define PMCNTENSET 0xC00 -#define PMCNTENCLR 0xC20 -#define PMINTENSET 0xC40 -#define PMINTENCLR 0xC60 -#define PMOVSCLR 0xC80 -#define PMOVSSET 0xCC0 -#define PMCFGR 0xE00 -#define PMCR 0xE04 -#define PMIIDR 0xE08 - -/* PMCFGR register field */ -#define PMCFGR_NCG GENMASK(31, 28) -#define PMCFGR_HDBG BIT(24) -#define PMCFGR_TRO BIT(23) -#define PMCFGR_SS BIT(22) -#define PMCFGR_FZO BIT(21) -#define PMCFGR_MSI BIT(20) -#define PMCFGR_UEN BIT(19) -#define PMCFGR_NA BIT(17) -#define PMCFGR_EX BIT(16) -#define PMCFGR_CCD BIT(15) -#define PMCFGR_CC BIT(14) -#define PMCFGR_SIZE GENMASK(13, 8) -#define PMCFGR_N GENMASK(7, 0) - -/* PMCR register field */ -#define PMCR_TRO BIT(11) -#define PMCR_HDBG BIT(10) -#define PMCR_FZO BIT(9) -#define PMCR_NA BIT(8) -#define PMCR_DP BIT(5) -#define PMCR_X BIT(4) -#define PMCR_D BIT(3) -#define PMCR_C BIT(2) -#define PMCR_P BIT(1) -#define PMCR_E BIT(0) - /* Each SET/CLR register supports up to 32 counters. 
*/ #define ARM_CSPMU_SET_CLR_COUNTER_SHIFT 5 #define ARM_CSPMU_SET_CLR_COUNTER_NUM \ @@ -111,7 +66,9 @@ static unsigned long arm_cspmu_cpuhp_state; static DEFINE_MUTEX(arm_cspmu_lock); static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, u32 filter); + const struct perf_event *event); +static void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event); static struct acpi_apmt_node *arm_cspmu_apmt_node(struct device *dev) { @@ -226,6 +183,7 @@ arm_cspmu_event_attr_is_visible(struct kobject *kobj, static struct attribute *arm_cspmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, + ARM_CSPMU_FORMAT_FILTER2_ATTR, NULL, }; @@ -250,11 +208,6 @@ static bool arm_cspmu_is_cycle_counter_event(const struct perf_event *event) return (event->attr.config == ARM_CSPMU_EVT_CYCLES_DEFAULT); } -static u32 arm_cspmu_event_filter(const struct perf_event *event) -{ - return event->attr.config1 & ARM_CSPMU_FILTER_MASK; -} - static ssize_t arm_cspmu_identifier_show(struct device *dev, struct device_attribute *attr, char *page) @@ -416,7 +369,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu) DEFAULT_IMPL_OP(get_name), DEFAULT_IMPL_OP(is_cycle_counter_event), DEFAULT_IMPL_OP(event_type), - DEFAULT_IMPL_OP(event_filter), + DEFAULT_IMPL_OP(set_cc_filter), DEFAULT_IMPL_OP(set_ev_filter), DEFAULT_IMPL_OP(event_attr_is_visible), }; @@ -812,26 +765,28 @@ static inline void arm_cspmu_set_event(struct arm_cspmu *cspmu, } static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, - u32 filter) + const struct perf_event *event) { - u32 offset = PMEVFILTR + (4 * hwc->idx); + u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK; + u32 filter2 = event->attr.config2 & ARM_CSPMU_FILTER_MASK; + u32 offset = 4 * event->hw.idx; - writel(filter, cspmu->base0 + offset); + writel(filter, cspmu->base0 + PMEVFILTR + offset); + writel(filter2, cspmu->base0 + PMEVFILT2R + offset); } -static inline void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, u32 filter) +static void arm_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) { - u32 offset = PMCCFILTR; + u32 filter = event->attr.config1 & ARM_CSPMU_FILTER_MASK; - writel(filter, cspmu->base0 + offset); + writel(filter, cspmu->base0 + PMCCFILTR); } static void arm_cspmu_start(struct perf_event *event, int pmu_flags) { struct arm_cspmu *cspmu = to_arm_cspmu(event->pmu); struct hw_perf_event *hwc = &event->hw; - u32 filter; /* We always reprogram the counter */ if (pmu_flags & PERF_EF_RELOAD) @@ -839,13 +794,11 @@ static void arm_cspmu_start(struct perf_event *event, int pmu_flags) arm_cspmu_set_event_period(event); - filter = cspmu->impl.ops.event_filter(event); - if (event->hw.extra_reg.idx == cspmu->cycle_counter_logical_idx) { - arm_cspmu_set_cc_filter(cspmu, filter); + cspmu->impl.ops.set_cc_filter(cspmu, event); } else { arm_cspmu_set_event(cspmu, hwc); - cspmu->impl.ops.set_ev_filter(cspmu, hwc, filter); + cspmu->impl.ops.set_ev_filter(cspmu, event); } hwc->state = 0; diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h index 2621f3111148..19684b76bd96 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.h +++ b/drivers/perf/arm_cspmu/arm_cspmu.h @@ -47,6 +47,8 @@ /* Default filter format */ #define ARM_CSPMU_FORMAT_FILTER_ATTR \ ARM_CSPMU_FORMAT_ATTR(filter, "config1:0-31") +#define ARM_CSPMU_FORMAT_FILTER2_ATTR \ + ARM_CSPMU_FORMAT_ATTR(filter2, "config2:0-31") /* * This is the default 
event number for cycle count, if supported, since the @@ -65,6 +67,53 @@ /* The cycle counter, if implemented, is located at counter[31]. */ #define ARM_CSPMU_CYCLE_CNTR_IDX 31 +/* + * CoreSight PMU Arch register offsets. + */ +#define PMEVCNTR_LO 0x0 +#define PMEVCNTR_HI 0x4 +#define PMEVTYPER 0x400 +#define PMCCFILTR 0x47C +#define PMEVFILT2R 0x800 +#define PMEVFILTR 0xA00 +#define PMCNTENSET 0xC00 +#define PMCNTENCLR 0xC20 +#define PMINTENSET 0xC40 +#define PMINTENCLR 0xC60 +#define PMOVSCLR 0xC80 +#define PMOVSSET 0xCC0 +#define PMIMPDEF 0xD80 +#define PMCFGR 0xE00 +#define PMCR 0xE04 +#define PMIIDR 0xE08 + +/* PMCFGR register field */ +#define PMCFGR_NCG GENMASK(31, 28) +#define PMCFGR_HDBG BIT(24) +#define PMCFGR_TRO BIT(23) +#define PMCFGR_SS BIT(22) +#define PMCFGR_FZO BIT(21) +#define PMCFGR_MSI BIT(20) +#define PMCFGR_UEN BIT(19) +#define PMCFGR_NA BIT(17) +#define PMCFGR_EX BIT(16) +#define PMCFGR_CCD BIT(15) +#define PMCFGR_CC BIT(14) +#define PMCFGR_SIZE GENMASK(13, 8) +#define PMCFGR_N GENMASK(7, 0) + +/* PMCR register field */ +#define PMCR_TRO BIT(11) +#define PMCR_HDBG BIT(10) +#define PMCR_FZO BIT(9) +#define PMCR_NA BIT(8) +#define PMCR_DP BIT(5) +#define PMCR_X BIT(4) +#define PMCR_D BIT(3) +#define PMCR_C BIT(2) +#define PMCR_P BIT(1) +#define PMCR_E BIT(0) + /* PMIIDR register field */ #define ARM_CSPMU_PMIIDR_IMPLEMENTER GENMASK(11, 0) #define ARM_CSPMU_PMIIDR_PRODUCTID GENMASK(31, 20) @@ -103,11 +152,11 @@ struct arm_cspmu_impl_ops { bool (*is_cycle_counter_event)(const struct perf_event *event); /* Decode event type/id from configs */ u32 (*event_type)(const struct perf_event *event); - /* Decode filter value from configs */ - u32 (*event_filter)(const struct perf_event *event); - /* Set event filter */ + /* Set event filters */ + void (*set_cc_filter)(struct arm_cspmu *cspmu, + const struct perf_event *event); void (*set_ev_filter)(struct arm_cspmu *cspmu, - struct hw_perf_event *hwc, u32 filter); + const struct perf_event *event); /* Implementation specific event validation */ int (*validate_event)(struct arm_cspmu *cspmu, struct perf_event *event); diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 8116c7846a46..dc6d4e3e2a1b 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -6,6 +6,7 @@ /* Support for NVIDIA specific attributes. */ +#include <linux/io.h> #include <linux/module.h> #include <linux/topology.h> @@ -183,6 +184,24 @@ static u32 nv_cspmu_event_filter(const struct perf_event *event) return filter_val; } +static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + u32 filter = nv_cspmu_event_filter(event); + u32 offset = PMEVFILTR + (4 * event->hw.idx); + + writel(filter, cspmu->base0 + offset); +} + +static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + u32 filter = nv_cspmu_event_filter(event); + + writel(filter, cspmu->base0 + PMCCFILTR); +} + + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, NAME_FMT_SOCKET @@ -322,7 +341,8 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. 
*/ - impl_ops->event_filter = nv_cspmu_event_filter; + impl_ops->set_cc_filter = nv_cspmu_set_cc_filter; + impl_ops->set_ev_filter = nv_cspmu_set_ev_filter; impl_ops->get_event_attrs = nv_cspmu_get_event_attrs; impl_ops->get_format_attrs = nv_cspmu_get_format_attrs; impl_ops->get_name = nv_cspmu_get_name; diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 398cce3d76fc..2f33e69a8caf 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -342,12 +342,10 @@ armpmu_add(struct perf_event *event, int flags) if (idx < 0) return idx; - /* - * If there is an event in the counter we are going to use then make - * sure it is disabled. - */ + /* The newly-allocated counter should be empty */ + WARN_ON_ONCE(hw_events->events[idx]); + event->hw.idx = idx; - armpmu->disable(event); hw_events->events[idx] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0e360feb3432..e506d59654e7 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -795,11 +795,6 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) static void armv8pmu_enable_event(struct perf_event *event) { - /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - armv8pmu_disable_event_counter(event); armv8pmu_write_event_type(event); armv8pmu_enable_event_irq(event); armv8pmu_enable_event_counter(event); @@ -825,10 +820,10 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu) else armv8pmu_disable_user_access(); + kvm_vcpu_pmu_resync_el0(); + /* Enable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - - kvm_vcpu_pmu_resync_el0(); } static void armv8pmu_stop(struct arm_pmu *cpu_pmu) @@ -1369,6 +1364,7 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v1) PMUV3_INIT_SIMPLE(armv8_neoverse_v2) PMUV3_INIT_SIMPLE(armv8_neoverse_v3) PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae) +PMUV3_INIT_SIMPLE(armv8_rainier) PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) PMUV3_INIT_SIMPLE(armv8_nvidia_denver) @@ -1416,6 +1412,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,neoverse-v2-pmu", .data = armv8_neoverse_v2_pmu_init}, {.compatible = "arm,neoverse-v3-pmu", .data = armv8_neoverse_v3_pmu_init}, {.compatible = "arm,neoverse-v3ae-pmu", .data = armv8_neoverse_v3ae_pmu_init}, + {.compatible = "arm,rainier-pmu", .data = armv8_rainier_pmu_init}, {.compatible = "cavium,thunder-pmu", .data = armv8_cavium_thunder_pmu_init}, {.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init}, {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, diff --git a/drivers/perf/arm_v7_pmu.c b/drivers/perf/arm_v7_pmu.c index 420cadd108e7..17831e1920bd 100644 --- a/drivers/perf/arm_v7_pmu.c +++ b/drivers/perf/arm_v7_pmu.c @@ -858,16 +858,6 @@ static void armv7pmu_enable_event(struct perf_event *event) } /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - - /* - * Disable counter - */ - armv7_pmnc_disable_counter(idx); - - /* * Set event (if destined for PMNx counters) * We only need to set the event for the cycle counter if we * have the ability to perform event filtering. 
@@ -875,14 +865,7 @@ static void armv7pmu_enable_event(struct perf_event *event) if (cpu_pmu->set_event_filter || idx != ARMV7_IDX_CYCLE_COUNTER) armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* - * Enable interrupt for this counter - */ armv7_pmnc_enable_intens(idx); - - /* - * Enable counter - */ armv7_pmnc_enable_counter(idx); } @@ -898,18 +881,7 @@ static void armv7pmu_disable_event(struct perf_event *event) return; } - /* - * Disable counter and interrupt - */ - - /* - * Disable counter - */ armv7_pmnc_disable_counter(idx); - - /* - * Disable interrupt for this counter - */ armv7_pmnc_disable_intens(idx); } @@ -1477,14 +1449,6 @@ static void krait_pmu_enable_event(struct perf_event *event) int idx = hwc->idx; /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - - /* Disable counter */ - armv7_pmnc_disable_counter(idx); - - /* * Set event (if destined for PMNx counters) * We set the event for the cycle counter because we * have the ability to perform event filtering. @@ -1494,10 +1458,7 @@ static void krait_pmu_enable_event(struct perf_event *event) else armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* Enable interrupt for this counter */ armv7_pmnc_enable_intens(idx); - - /* Enable counter */ armv7_pmnc_enable_counter(idx); } @@ -1798,14 +1759,6 @@ static void scorpion_pmu_enable_event(struct perf_event *event) int idx = hwc->idx; /* - * Enable counter and interrupt, and set the counter to count - * the event that we're interested in. - */ - - /* Disable counter */ - armv7_pmnc_disable_counter(idx); - - /* * Set event (if destined for PMNx counters) * We don't set the event for the cycle counter because we * don't have the ability to perform event filtering. @@ -1815,10 +1768,7 @@ static void scorpion_pmu_enable_event(struct perf_event *event) else if (idx != ARMV7_IDX_CYCLE_COUNTER) armv7_pmnc_write_evtsel(idx, hwc->config_base); - /* Enable interrupt for this counter */ armv7_pmnc_enable_intens(idx); - - /* Enable counter */ armv7_pmnc_enable_counter(idx); } diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index cccecae9823f..f851e070760c 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -565,15 +565,15 @@ static int dwc_pcie_register_dev(struct pci_dev *pdev) u32 sbdf; sbdf = (pci_domain_nr(pdev->bus) << 16) | PCI_DEVID(pdev->bus->number, pdev->devfn); - plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", sbdf, - pdev, sizeof(*pdev)); - + plat_dev = platform_device_register_simple("dwc_pcie_pmu", sbdf, NULL, 0); if (IS_ERR(plat_dev)) return PTR_ERR(plat_dev); dev_info = kzalloc(sizeof(*dev_info), GFP_KERNEL); - if (!dev_info) + if (!dev_info) { + platform_device_unregister(plat_dev); return -ENOMEM; + } /* Cache platform device to handle pci device hotplug */ dev_info->plat_dev = plat_dev; @@ -614,18 +614,26 @@ static struct notifier_block dwc_pcie_pmu_nb = { static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) { - struct pci_dev *pdev = plat_dev->dev.platform_data; + struct pci_dev *pdev; struct dwc_pcie_pmu *pcie_pmu; char *name; u32 sbdf; u16 vsec; int ret; + sbdf = plat_dev->id; + pdev = pci_get_domain_bus_and_slot(sbdf >> 16, PCI_BUS_NUM(sbdf & 0xffff), + sbdf & 0xff); + if (!pdev) { + pr_err("No pdev found for the sbdf 0x%x\n", sbdf); + return -ENODEV; + } + vsec = dwc_pcie_des_cap(pdev); if (!vsec) return -ENODEV; - sbdf = plat_dev->id; + pci_dev_put(pdev); name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x", sbdf); if 
(!name) return -ENOMEM; @@ -640,7 +648,7 @@ static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) pcie_pmu->on_cpu = -1; pcie_pmu->pmu = (struct pmu){ .name = name, - .parent = &pdev->dev, + .parent = &plat_dev->dev, .module = THIS_MODULE, .attr_groups = dwc_pcie_attr_groups, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, @@ -730,6 +738,15 @@ static struct platform_driver dwc_pcie_pmu_driver = { .driver = {.name = "dwc_pcie_pmu",}, }; +static void dwc_pcie_cleanup_devices(void) +{ + struct dwc_pcie_dev_info *dev_info, *tmp; + + list_for_each_entry_safe(dev_info, tmp, &dwc_pcie_dev_info_head, dev_node) { + dwc_pcie_unregister_dev(dev_info); + } +} + static int __init dwc_pcie_pmu_init(void) { struct pci_dev *pdev = NULL; @@ -742,7 +759,7 @@ static int __init dwc_pcie_pmu_init(void) ret = dwc_pcie_register_dev(pdev); if (ret) { pci_dev_put(pdev); - return ret; + goto err_cleanup; } } @@ -751,35 +768,35 @@ static int __init dwc_pcie_pmu_init(void) dwc_pcie_pmu_online_cpu, dwc_pcie_pmu_offline_cpu); if (ret < 0) - return ret; + goto err_cleanup; dwc_pcie_pmu_hp_state = ret; ret = platform_driver_register(&dwc_pcie_pmu_driver); if (ret) - goto platform_driver_register_err; + goto err_remove_cpuhp; ret = bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); if (ret) - goto platform_driver_register_err; + goto err_unregister_driver; notify = true; return 0; -platform_driver_register_err: +err_unregister_driver: + platform_driver_unregister(&dwc_pcie_pmu_driver); +err_remove_cpuhp: cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); - +err_cleanup: + dwc_pcie_cleanup_devices(); return ret; } static void __exit dwc_pcie_pmu_exit(void) { - struct dwc_pcie_dev_info *dev_info, *tmp; - if (notify) bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); - list_for_each_entry_safe(dev_info, tmp, &dwc_pcie_dev_info_head, dev_node) - dwc_pcie_unregister_dev(dev_info); + dwc_pcie_cleanup_devices(); platform_driver_unregister(&dwc_pcie_pmu_driver); cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); } diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c index 039feded9152..72ac17efd846 100644 --- a/drivers/perf/marvell_cn10k_ddr_pmu.c +++ b/drivers/perf/marvell_cn10k_ddr_pmu.c @@ -1064,8 +1064,8 @@ static int cn10k_ddr_perf_probe(struct platform_device *pdev) if (!name) return -ENOMEM; - hrtimer_init(&ddr_pmu->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ddr_pmu->hrtimer.function = cn10k_ddr_pmu_timer_handler; + hrtimer_setup(&ddr_pmu->hrtimer, cn10k_ddr_pmu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cpuhp_state_add_instance_nocalls( CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE, diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index cadd60221b8f..6ed4707bd6bb 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -752,9 +752,8 @@ static int tx2_uncore_pmu_add_dev(struct tx2_uncore_pmu *tx2_pmu) tx2_pmu->cpu = cpu; if (tx2_pmu->hrtimer_callback) { - hrtimer_init(&tx2_pmu->hrtimer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - tx2_pmu->hrtimer.function = tx2_pmu->hrtimer_callback; + hrtimer_setup(&tx2_pmu->hrtimer, tx2_pmu->hrtimer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } ret = tx2_uncore_pmu_register(tx2_pmu); diff --git a/drivers/pinctrl/spacemit/Kconfig b/drivers/pinctrl/spacemit/Kconfig index a2f98b3f8a75..d6f6017fd097 100644 --- a/drivers/pinctrl/spacemit/Kconfig +++ b/drivers/pinctrl/spacemit/Kconfig @@ -7,7 +7,7 @@ config PINCTRL_SPACEMIT_K1 bool "SpacemiT K1 SoC Pinctrl driver" depends on ARCH_SPACEMIT 
|| COMPILE_TEST depends on OF - default y + default ARCH_SPACEMIT select GENERIC_PINCTRL_GROUPS select GENERIC_PINMUX_FUNCTIONS select GENERIC_PINCONF diff --git a/drivers/platform/x86/amd/hsmp/Kconfig b/drivers/platform/x86/amd/hsmp/Kconfig index 7d10d4462a45..d6f7a62d55b5 100644 --- a/drivers/platform/x86/amd/hsmp/Kconfig +++ b/drivers/platform/x86/amd/hsmp/Kconfig @@ -7,7 +7,7 @@ config AMD_HSMP tristate menu "AMD HSMP Driver" - depends on AMD_NB || COMPILE_TEST + depends on AMD_NODE || COMPILE_TEST config AMD_HSMP_ACPI tristate "AMD HSMP ACPI device driver" diff --git a/drivers/platform/x86/amd/hsmp/acpi.c b/drivers/platform/x86/amd/hsmp/acpi.c index 444b43be35a2..c1eccb3c80c5 100644 --- a/drivers/platform/x86/amd/hsmp/acpi.c +++ b/drivers/platform/x86/amd/hsmp/acpi.c @@ -10,7 +10,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/amd_hsmp.h> -#include <asm/amd_nb.h> #include <linux/acpi.h> #include <linux/device.h> @@ -24,6 +23,8 @@ #include <uapi/asm-generic/errno-base.h> +#include <asm/amd_node.h> + #include "hsmp.h" #define DRIVER_NAME "amd_hsmp" @@ -321,8 +322,8 @@ static int hsmp_acpi_probe(struct platform_device *pdev) return -ENOMEM; if (!hsmp_pdev->is_probed) { - hsmp_pdev->num_sockets = amd_nb_num(); - if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_SOCKETS) + hsmp_pdev->num_sockets = amd_num_nodes(); + if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_NUM_NODES) return -ENODEV; hsmp_pdev->sock = devm_kcalloc(&pdev->dev, hsmp_pdev->num_sockets, diff --git a/drivers/platform/x86/amd/hsmp/hsmp.c b/drivers/platform/x86/amd/hsmp/hsmp.c index 03164e30b3a5..a3ac09a90de4 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.c +++ b/drivers/platform/x86/amd/hsmp/hsmp.c @@ -8,7 +8,6 @@ */ #include <asm/amd_hsmp.h> -#include <asm/amd_nb.h> #include <linux/acpi.h> #include <linux/delay.h> diff --git a/drivers/platform/x86/amd/hsmp/hsmp.h b/drivers/platform/x86/amd/hsmp/hsmp.h index e852f0a947e4..af8b21f821d6 100644 --- a/drivers/platform/x86/amd/hsmp/hsmp.h +++ b/drivers/platform/x86/amd/hsmp/hsmp.h @@ -21,8 +21,6 @@ #define HSMP_ATTR_GRP_NAME_SIZE 10 -#define MAX_AMD_SOCKETS 8 - #define HSMP_CDEV_NAME "hsmp_cdev" #define HSMP_DEVNODE_NAME "hsmp" @@ -41,7 +39,6 @@ struct hsmp_socket { void __iomem *virt_base_addr; struct semaphore hsmp_sem; char name[HSMP_ATTR_GRP_NAME_SIZE]; - struct pci_dev *root; struct device *dev; u16 sock_ind; int (*amd_hsmp_rdwr)(struct hsmp_socket *sock, u32 off, u32 *val, bool rw); diff --git a/drivers/platform/x86/amd/hsmp/plat.c b/drivers/platform/x86/amd/hsmp/plat.c index 02ca85762b68..b9782a078dbd 100644 --- a/drivers/platform/x86/amd/hsmp/plat.c +++ b/drivers/platform/x86/amd/hsmp/plat.c @@ -10,14 +10,16 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/amd_hsmp.h> -#include <asm/amd_nb.h> +#include <linux/build_bug.h> #include <linux/device.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> #include <linux/sysfs.h> +#include <asm/amd_node.h> + #include "hsmp.h" #define DRIVER_NAME "amd_hsmp" @@ -34,28 +36,12 @@ #define SMN_HSMP_MSG_RESP 0x0010980 #define SMN_HSMP_MSG_DATA 0x00109E0 -#define HSMP_INDEX_REG 0xc4 -#define HSMP_DATA_REG 0xc8 - static struct hsmp_plat_device *hsmp_pdev; static int amd_hsmp_pci_rdwr(struct hsmp_socket *sock, u32 offset, u32 *value, bool write) { - int ret; - - if (!sock->root) - return -ENODEV; - - ret = pci_write_config_dword(sock->root, HSMP_INDEX_REG, - sock->mbinfo.base_addr + offset); - if (ret) - return ret; - - ret = (write ? 
pci_write_config_dword(sock->root, HSMP_DATA_REG, *value) - : pci_read_config_dword(sock->root, HSMP_DATA_REG, value)); - - return ret; + return amd_smn_hsmp_rdwr(sock->sock_ind, sock->mbinfo.base_addr + offset, value, write); } static ssize_t hsmp_metric_tbl_plat_read(struct file *filp, struct kobject *kobj, @@ -95,7 +81,12 @@ static umode_t hsmp_is_sock_attr_visible(struct kobject *kobj, * Static array of 8 + 1(for NULL) elements is created below * to create sysfs groups for sockets. * is_bin_visible function is used to show / hide the necessary groups. + * + * Validate the maximum number against MAX_AMD_NUM_NODES. If this changes, + * then the attributes and groups below must be adjusted. */ +static_assert(MAX_AMD_NUM_NODES == 8); + #define HSMP_BIN_ATTR(index, _list) \ static const struct bin_attribute attr##index = { \ .attr = { .name = HSMP_METRICS_TABLE_NAME, .mode = 0444}, \ @@ -159,10 +150,7 @@ static int init_platform_device(struct device *dev) int ret, i; for (i = 0; i < hsmp_pdev->num_sockets; i++) { - if (!node_to_amd_nb(i)) - return -ENODEV; sock = &hsmp_pdev->sock[i]; - sock->root = node_to_amd_nb(i)->root; sock->sock_ind = i; sock->dev = dev; sock->mbinfo.base_addr = SMN_HSMP_BASE; @@ -305,11 +293,11 @@ static int __init hsmp_plt_init(void) return -ENOMEM; /* - * amd_nb_num() returns number of SMN/DF interfaces present in the system + * amd_num_nodes() returns number of SMN/DF interfaces present in the system * if we have N SMN/DF interfaces that ideally means N sockets */ - hsmp_pdev->num_sockets = amd_nb_num(); - if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_SOCKETS) + hsmp_pdev->num_sockets = amd_num_nodes(); + if (hsmp_pdev->num_sockets == 0 || hsmp_pdev->num_sockets > MAX_AMD_NUM_NODES) return ret; ret = platform_driver_register(&amd_hsmp_driver); diff --git a/drivers/pmdomain/amlogic/meson-secure-pwrc.c b/drivers/pmdomain/amlogic/meson-secure-pwrc.c index 42ce41a2fe3a..ff76ea36835e 100644 --- a/drivers/pmdomain/amlogic/meson-secure-pwrc.c +++ b/drivers/pmdomain/amlogic/meson-secure-pwrc.c @@ -221,7 +221,7 @@ static const struct meson_secure_pwrc_domain_desc t7_pwrc_domains[] = { SEC_PD(T7_VI_CLK2, 0), /* ETH is for ethernet online wakeup, and should be always on */ SEC_PD(T7_ETH, GENPD_FLAG_ALWAYS_ON), - SEC_PD(T7_ISP, 0), + TOP_PD(T7_ISP, 0, PWRC_T7_MIPI_ISP_ID), SEC_PD(T7_MIPI_ISP, 0), TOP_PD(T7_GDC, 0, PWRC_T7_NIC3_ID), TOP_PD(T7_DEWARP, 0, PWRC_T7_NIC3_ID), diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index 1a6fc8d38e20..90c664d344d0 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -162,11 +162,11 @@ static void ltc2952_poweroff_default(struct ltc2952_poweroff *data) data->wde_interval = 300L * NSEC_PER_MSEC; data->trigger_delay = ktime_set(2, 500L * NSEC_PER_MSEC); - hrtimer_init(&data->timer_trigger, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->timer_trigger.function = ltc2952_poweroff_timer_trigger; + hrtimer_setup(&data->timer_trigger, ltc2952_poweroff_timer_trigger, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); - hrtimer_init(&data->timer_wde, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->timer_wde.function = ltc2952_poweroff_timer_wde; + hrtimer_setup(&data->timer_wde, ltc2952_poweroff_timer_wde, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } static int ltc2952_poweroff_init(struct platform_device *pdev) diff --git a/drivers/power/supply/ab8500_chargalg.c b/drivers/power/supply/ab8500_chargalg.c index 7a8d1feb8e90..dc6c8b0dd1cf 100644 --- 
a/drivers/power/supply/ab8500_chargalg.c +++ b/drivers/power/supply/ab8500_chargalg.c @@ -1787,13 +1787,12 @@ static int ab8500_chargalg_probe(struct platform_device *pdev) psy_cfg.drv_data = di; /* Initilialize safety timer */ - hrtimer_init(&di->safety_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - di->safety_timer.function = ab8500_chargalg_safety_timer_expired; + hrtimer_setup(&di->safety_timer, ab8500_chargalg_safety_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Initilialize maintenance timer */ - hrtimer_init(&di->maintenance_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - di->maintenance_timer.function = - ab8500_chargalg_maintenance_timer_expired; + hrtimer_setup(&di->maintenance_timer, ab8500_chargalg_maintenance_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* Init work for chargalg */ INIT_DEFERRABLE_WORK(&di->chargalg_periodic_work, diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index 04c212953ded..5ad7cc438068 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -339,8 +339,7 @@ struct idle_inject_device *idle_inject_register_full(struct cpumask *cpumask, return NULL; cpumask_copy(to_cpumask(ii_dev->cpumask), cpumask); - hrtimer_init(&ii_dev->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - ii_dev->timer.function = idle_inject_timer_fn; + hrtimer_setup(&ii_dev->timer, idle_inject_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ii_dev->latency_us = UINT_MAX; ii_dev->update = update; diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 77d75e1f14a9..5ab3feb29686 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1274,7 +1274,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &rapl_defaults_byt), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &rapl_defaults_cht), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &rapl_defaults_tng), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &rapl_defaults_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2,&rapl_defaults_ann), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &rapl_defaults_core), @@ -2064,8 +2064,7 @@ int rapl_package_add_pmu(struct rapl_package *rp) raw_spin_lock_init(&data->lock); INIT_LIST_HEAD(&data->active_list); data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms); - hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - data->hrtimer.function = rapl_hrtimer_handle; + hrtimer_setup(&data->hrtimer, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return rapl_pmu_update(rp); } diff --git a/drivers/pps/generators/pps_gen_parport.c b/drivers/pps/generators/pps_gen_parport.c index d46eed159495..f5eeb4dd01ad 100644 --- a/drivers/pps/generators/pps_gen_parport.c +++ b/drivers/pps/generators/pps_gen_parport.c @@ -208,8 +208,7 @@ static void parport_attach(struct parport *port) calibrate_port(&device); - hrtimer_init(&device.timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - device.timer.function = hrtimer_event; + hrtimer_setup(&device.timer, hrtimer_event, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_start(&device.timer, next_intr_time(&device), HRTIMER_MODE_ABS); return; diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 4ddf0efead68..00a7f3617cd8 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1830,12 +1830,49 @@ static const struct file_operations constraint_flags_fops = { #define REG_STR_SIZE 
64 +static void link_and_create_debugfs(struct regulator *regulator, struct regulator_dev *rdev, + struct device *dev) +{ + int err = 0; + + if (dev) { + regulator->dev = dev; + + /* Add a link to the device sysfs entry */ + err = sysfs_create_link_nowarn(&rdev->dev.kobj, &dev->kobj, + regulator->supply_name); + if (err) { + rdev_dbg(rdev, "could not add device link %s: %pe\n", + dev->kobj.name, ERR_PTR(err)); + /* non-fatal */ + } + } + + if (err != -EEXIST) { + regulator->debugfs = debugfs_create_dir(regulator->supply_name, rdev->debugfs); + if (IS_ERR(regulator->debugfs)) { + rdev_dbg(rdev, "Failed to create debugfs directory\n"); + regulator->debugfs = NULL; + } + } + + if (regulator->debugfs) { + debugfs_create_u32("uA_load", 0444, regulator->debugfs, + ®ulator->uA_load); + debugfs_create_u32("min_uV", 0444, regulator->debugfs, + ®ulator->voltage[PM_SUSPEND_ON].min_uV); + debugfs_create_u32("max_uV", 0444, regulator->debugfs, + ®ulator->voltage[PM_SUSPEND_ON].max_uV); + debugfs_create_file("constraint_flags", 0444, regulator->debugfs, + regulator, &constraint_flags_fops); + } +} + static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, const char *supply_name) { struct regulator *regulator; - int err = 0; lockdep_assert_held_once(&rdev->mutex.base); @@ -1868,38 +1905,6 @@ static struct regulator *create_regulator(struct regulator_dev *rdev, list_add(®ulator->list, &rdev->consumer_list); - if (dev) { - regulator->dev = dev; - - /* Add a link to the device sysfs entry */ - err = sysfs_create_link_nowarn(&rdev->dev.kobj, &dev->kobj, - supply_name); - if (err) { - rdev_dbg(rdev, "could not add device link %s: %pe\n", - dev->kobj.name, ERR_PTR(err)); - /* non-fatal */ - } - } - - if (err != -EEXIST) { - regulator->debugfs = debugfs_create_dir(supply_name, rdev->debugfs); - if (IS_ERR(regulator->debugfs)) { - rdev_dbg(rdev, "Failed to create debugfs directory\n"); - regulator->debugfs = NULL; - } - } - - if (regulator->debugfs) { - debugfs_create_u32("uA_load", 0444, regulator->debugfs, - ®ulator->uA_load); - debugfs_create_u32("min_uV", 0444, regulator->debugfs, - ®ulator->voltage[PM_SUSPEND_ON].min_uV); - debugfs_create_u32("max_uV", 0444, regulator->debugfs, - ®ulator->voltage[PM_SUSPEND_ON].max_uV); - debugfs_create_file("constraint_flags", 0444, regulator->debugfs, - regulator, &constraint_flags_fops); - } - /* * Check now if the regulator is an always on regulator - if * it is then we don't need to do nearly so much work for @@ -2069,6 +2074,10 @@ static int regulator_resolve_supply(struct regulator_dev *rdev) if (have_full_constraints()) { r = dummy_regulator_rdev; + if (!r) { + ret = -EPROBE_DEFER; + goto out; + } get_device(&r->dev); } else { dev_err(dev, "Failed to resolve %s-supply for %s\n", @@ -2086,6 +2095,10 @@ static int regulator_resolve_supply(struct regulator_dev *rdev) goto out; } r = dummy_regulator_rdev; + if (!r) { + ret = -EPROBE_DEFER; + goto out; + } get_device(&r->dev); } @@ -2133,6 +2146,9 @@ static int regulator_resolve_supply(struct regulator_dev *rdev) regulator_unlock_two(rdev, r, &ww_ctx); + /* rdev->supply was created in set_supply() */ + link_and_create_debugfs(rdev->supply, r, &rdev->dev); + /* * In set_machine_constraints() we may have turned this regulator on * but we couldn't propagate to the supply if it hadn't been resolved @@ -2211,8 +2227,10 @@ struct regulator *_regulator_get_common(struct regulator_dev *rdev, struct devic * enabled, even if it isn't hooked up, and just * provide a dummy. 
*/ - dev_warn(dev, "supply %s not found, using dummy regulator\n", id); rdev = dummy_regulator_rdev; + if (!rdev) + return ERR_PTR(-EPROBE_DEFER); + dev_warn(dev, "supply %s not found, using dummy regulator\n", id); get_device(&rdev->dev); break; @@ -2271,6 +2289,8 @@ struct regulator *_regulator_get_common(struct regulator_dev *rdev, struct devic return regulator; } + link_and_create_debugfs(regulator, rdev, dev); + rdev->open_count++; if (get_type == EXCLUSIVE_GET) { rdev->exclusive = 1; diff --git a/drivers/regulator/dummy.c b/drivers/regulator/dummy.c index 5b9b9e4e762d..9f59889129ab 100644 --- a/drivers/regulator/dummy.c +++ b/drivers/regulator/dummy.c @@ -60,7 +60,7 @@ static struct platform_driver dummy_regulator_driver = { .probe = dummy_regulator_probe, .driver = { .name = "reg-dummy", - .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .probe_type = PROBE_FORCE_SYNCHRONOUS, }, }; diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c index 5925fa7a9a06..9cde7181b0f0 100644 --- a/drivers/regulator/rtq2208-regulator.c +++ b/drivers/regulator/rtq2208-regulator.c @@ -27,6 +27,11 @@ #define RTQ2208_REG_LDO1_CFG 0xB1 #define RTQ2208_REG_LDO2_CFG 0xC1 #define RTQ2208_REG_LDO_DVS_CTRL 0xD0 +#define RTQ2208_REG_HIDDEN_BUCKPH 0x55 +#define RTQ2208_REG_HIDDEN_LDOCFG0 0x8F +#define RTQ2208_REG_HIDDEN_LDOCFG1 0x96 +#define RTQ2208_REG_HIDDEN0 0xFE +#define RTQ2208_REG_HIDDEN1 0xFF /* Mask */ #define RTQ2208_BUCK_NR_MTP_SEL_MASK GENMASK(7, 0) @@ -45,6 +50,11 @@ #define RTQ2208_LDO1_VOSEL_SD_MASK BIT(5) #define RTQ2208_LDO2_DISCHG_EN_MASK BIT(6) #define RTQ2208_LDO2_VOSEL_SD_MASK BIT(7) +#define RTQ2208_MASK_BUCKPH_GROUP1 GENMASK(6, 4) +#define RTQ2208_MASK_BUCKPH_GROUP2 GENMASK(2, 0) +#define RTQ2208_MASK_LDO2_OPT0 BIT(7) +#define RTQ2208_MASK_LDO2_OPT1 BIT(6) +#define RTQ2208_MASK_LDO1_FIXED BIT(6) /* Size */ #define RTQ2208_VOUT_MAXNUM 256 @@ -245,11 +255,6 @@ static const unsigned int rtq2208_ldo_volt_table[] = { 3300000, }; -static struct of_regulator_match rtq2208_ldo_match[] = { - {.name = "ldo2", }, - {.name = "ldo1", }, -}; - static unsigned int rtq2208_of_map_mode(unsigned int mode) { switch (mode) { @@ -344,59 +349,6 @@ static irqreturn_t rtq2208_irq_handler(int irqno, void *devid) return IRQ_HANDLED; } -static int rtq2208_of_get_ldo_dvs_ability(struct device *dev) -{ - struct device_node *np; - struct of_regulator_match *match; - struct regulator_desc *desc; - struct regulator_init_data *init_data; - u32 fixed_uV; - int ret, i; - - if (!dev->of_node) - return -ENODEV; - - np = of_get_child_by_name(dev->of_node, "regulators"); - if (!np) - np = dev->of_node; - - ret = of_regulator_match(dev, np, rtq2208_ldo_match, ARRAY_SIZE(rtq2208_ldo_match)); - - of_node_put(np); - - if (ret < 0) - return ret; - - for (i = 0; i < ARRAY_SIZE(rtq2208_ldo_match); i++) { - match = rtq2208_ldo_match + i; - init_data = match->init_data; - desc = (struct regulator_desc *)match->desc; - - if (!init_data || !desc) - continue; - - /* specify working fixed voltage if the propery exists */ - ret = of_property_read_u32(match->of_node, "richtek,fixed-microvolt", &fixed_uV); - - if (!ret) { - if (fixed_uV != init_data->constraints.min_uV || - fixed_uV != init_data->constraints.max_uV) - return -EINVAL; - desc->n_voltages = 1; - desc->fixed_uV = fixed_uV; - desc->fixed_uV = init_data->constraints.min_uV; - desc->ops = &rtq2208_regulator_ldo_fix_ops; - } else { - desc->n_voltages = ARRAY_SIZE(rtq2208_ldo_volt_table); - desc->volt_table = rtq2208_ldo_volt_table; - desc->ops = 
&rtq2208_regulator_ldo_adj_ops; - } - } - - return 0; -} - - #define BUCK_INFO(_name, _id) \ { \ .name = _name, \ @@ -424,9 +376,11 @@ static const struct linear_range rtq2208_vout_range[] = { REGULATOR_LINEAR_RANGE(1310000, 181, 255, 10000), }; -static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, int idx) +static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, int mtp_sel, int idx, + unsigned int ldo1_fixed, unsigned int ldo2_fixed) { struct regulator_desc *desc; + unsigned int fixed_uV; static const struct { char *name; int base; @@ -462,7 +416,8 @@ static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, in rdesc->mode_mask = RTQ2208_BUCK_NRMODE_MASK; - if (idx >= RTQ2208_BUCK_B && idx <= RTQ2208_BUCK_E) { + switch (idx) { + case RTQ2208_BUCK_B ... RTQ2208_BUCK_E: /* init buck desc */ desc->ops = &rtq2208_regulator_buck_ops; desc->vsel_reg = curr_info->base + VSEL_SHIFT(mtp_sel); @@ -480,7 +435,19 @@ static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, in rdesc->suspend_config_reg = BUCK_RG_SHIFT(curr_info->base, 4); rdesc->suspend_enable_mask = RTQ2208_BUCK_EN_STR_MASK; rdesc->suspend_mode_mask = RTQ2208_BUCK_STRMODE_MASK; - } else { + break; + default: + fixed_uV = idx == RTQ2208_LDO2 ? ldo2_fixed : ldo1_fixed; + if (fixed_uV) { + desc->n_voltages = 1; + desc->fixed_uV = fixed_uV; + desc->ops = &rtq2208_regulator_ldo_fix_ops; + } else { + desc->n_voltages = ARRAY_SIZE(rtq2208_ldo_volt_table); + desc->volt_table = rtq2208_ldo_volt_table; + desc->ops = &rtq2208_regulator_ldo_adj_ops; + } + /* init ldo desc */ desc->active_discharge_reg = RTQ2208_REG_LDO_DVS_CTRL; desc->active_discharge_on = curr_info->dis_on; @@ -490,13 +457,15 @@ static void rtq2208_init_regulator_desc(struct rtq2208_regulator_desc *rdesc, in rdesc->suspend_config_reg = curr_info->base; rdesc->suspend_enable_mask = RTQ2208_LDO_EN_STR_MASK; + break; } } static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int *regulator_idx_table, - struct rtq2208_regulator_desc *rdesc[RTQ2208_LDO_MAX], struct device *dev) + struct rtq2208_regulator_desc *rdesc[RTQ2208_LDO_MAX], struct device *dev, + unsigned int ldo1_fixed, unsigned int ldo2_fixed) { - int mtp_sel, i, idx, ret; + int mtp_sel, i, idx; /* get mtp_sel0 or mtp_sel1 */ mtp_sel = device_property_read_bool(dev, "richtek,mtp-sel-high"); @@ -508,43 +477,101 @@ static int rtq2208_parse_regulator_dt_data(int n_regulator, const unsigned int * if (!rdesc[i]) return -ENOMEM; - rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx); - - /* init ldo dvs ability */ - if (idx >= RTQ2208_LDO2) - rtq2208_ldo_match[idx - RTQ2208_LDO2].desc = &rdesc[i]->desc; + rtq2208_init_regulator_desc(rdesc[i], mtp_sel, idx, ldo1_fixed, ldo2_fixed); } - /* init ldo fixed_uV */ - ret = rtq2208_of_get_ldo_dvs_ability(dev); - if (ret) - return dev_err_probe(dev, ret, "Failed to get ldo fixed_uV\n"); - return 0; } -/** different slave address corresponds different used bucks - * slave address 0x10: BUCK[BCA FGE] - * slave address 0x20: BUCK[BC FGHE] - * slave address 0x40: BUCK[C G] - */ -static int rtq2208_regulator_check(int slave_addr, int *num, - int *regulator_idx_table, unsigned int *buck_masks) +static int rtq2208_regulator_check(struct device *dev, int *num, int *regulator_idx_table, + unsigned int *buck_masks, unsigned int *ldo1_fixed_uV, + unsigned int *ldo2_fixed_uV) { - static bool rtq2208_used_table[3][RTQ2208_LDO_MAX] = { - /* BUCK[BCA FGE], LDO[12] */ - {1, 1, 0, 1, 
1, 1, 0, 1, 1, 1}, - /* BUCK[BC FGHE], LDO[12]*/ - {1, 1, 0, 0, 1, 1, 1, 1, 1, 1}, - /* BUCK[C G], LDO[12] */ - {0, 1, 0, 0, 0, 1, 0, 0, 1, 1}, - }; - int i, idx = ffs(slave_addr >> 4) - 1; + struct regmap *regmap = dev_get_regmap(dev, NULL); + bool rtq2208_used_table[RTQ2208_LDO_MAX] = {0}; + u8 entry_key[] = { 0x69, 0x01 }; + unsigned int buck_phase, ldo_cfg0, ldo_cfg1; + int i, ret; u8 mask; + ret = regmap_raw_write(regmap, RTQ2208_REG_HIDDEN0, entry_key, ARRAY_SIZE(entry_key)); + if (ret) + return dev_err_probe(dev, ret, "Failed to enter hidden page\n"); + + ret = regmap_read(regmap, RTQ2208_REG_HIDDEN_BUCKPH, &buck_phase); + if (ret) + return dev_err_probe(dev, ret, "Failed to read buck phase configuration\n"); + + ret = regmap_read(regmap, RTQ2208_REG_HIDDEN_LDOCFG0, &ldo_cfg0); + if (ret) + return dev_err_probe(dev, ret, "Failed to read ldo cfg0\n"); + + ret = regmap_read(regmap, RTQ2208_REG_HIDDEN_LDOCFG1, &ldo_cfg1); + if (ret) + return dev_err_probe(dev, ret, "Failed to read ldo cfg1\n"); + + ret = regmap_write(regmap, RTQ2208_REG_HIDDEN1, 0x00); + if (ret) + return dev_err_probe(dev, ret, "Failed to exit hidden page\n"); + + dev_info(dev, "BUCK Phase 0x%x\n", buck_phase); + /* + * Use buck phase configuration to assign used table mask + * GROUP1 GROUP2 + * 0 -> 2P + 2P BC FG + * 1 -> 2P + 1P + 1P BCA FGE + * 2 -> 1P + 1P + 1P + 1P BCDA FGHE + * 3 -> 3P + 1P BC FG + * others -> 4P C G + */ + switch (FIELD_GET(RTQ2208_MASK_BUCKPH_GROUP1, buck_phase)) { + case 2: + rtq2208_used_table[RTQ2208_BUCK_D] = true; + fallthrough; + case 1: + rtq2208_used_table[RTQ2208_BUCK_A] = true; + fallthrough; + case 0: + case 3: + rtq2208_used_table[RTQ2208_BUCK_B] = true; + fallthrough; + default: + rtq2208_used_table[RTQ2208_BUCK_C] = true; + break; + } + + switch (FIELD_GET(RTQ2208_MASK_BUCKPH_GROUP2, buck_phase)) { + case 2: + rtq2208_used_table[RTQ2208_BUCK_F] = true; + fallthrough; + case 1: + rtq2208_used_table[RTQ2208_BUCK_E] = true; + fallthrough; + case 0: + case 3: + rtq2208_used_table[RTQ2208_BUCK_H] = true; + fallthrough; + default: + rtq2208_used_table[RTQ2208_BUCK_G] = true; + break; + } + + *ldo1_fixed_uV = FIELD_GET(RTQ2208_MASK_LDO1_FIXED, ldo_cfg1) ? 
1200000 : 0; + + if (!FIELD_GET(RTQ2208_MASK_LDO2_OPT0, ldo_cfg0) && + !FIELD_GET(RTQ2208_MASK_LDO2_OPT1, ldo_cfg1)) + *ldo2_fixed_uV = 0; + else if (FIELD_GET(RTQ2208_MASK_LDO2_OPT1, ldo_cfg1)) + *ldo2_fixed_uV = 900000; + else + *ldo2_fixed_uV = 1200000; + + /* By default, LDO1 & LDO2 are always used */ + rtq2208_used_table[RTQ2208_LDO1] = rtq2208_used_table[RTQ2208_LDO2] = true; + for (i = 0; i < RTQ2208_LDO_MAX; i++) { - if (!rtq2208_used_table[idx][i]) + if (!rtq2208_used_table[i]) continue; regulator_idx_table[(*num)++] = i; @@ -559,7 +586,7 @@ static int rtq2208_regulator_check(int slave_addr, int *num, static const struct regmap_config rtq2208_regmap_config = { .reg_bits = 8, .val_bits = 8, - .max_register = 0xEF, + .max_register = 0xFF, }; static int rtq2208_probe(struct i2c_client *i2c) @@ -573,6 +600,7 @@ static int rtq2208_probe(struct i2c_client *i2c) int i, ret = 0, idx, n_regulator = 0; unsigned int regulator_idx_table[RTQ2208_LDO_MAX], buck_masks[RTQ2208_BUCK_NUM_IRQ_REGS] = {0x33, 0x33, 0x33, 0x33, 0x33}; + unsigned int ldo1_fixed_uV, ldo2_fixed_uV; rdev_map = devm_kzalloc(dev, sizeof(struct rtq2208_rdev_map), GFP_KERNEL); if (!rdev_map) @@ -583,7 +611,8 @@ static int rtq2208_probe(struct i2c_client *i2c) return dev_err_probe(dev, PTR_ERR(regmap), "Failed to allocate regmap\n"); /* get needed regulator */ - ret = rtq2208_regulator_check(i2c->addr, &n_regulator, regulator_idx_table, buck_masks); + ret = rtq2208_regulator_check(dev, &n_regulator, regulator_idx_table, buck_masks, + &ldo1_fixed_uV, &ldo2_fixed_uV); if (ret) return dev_err_probe(dev, ret, "Failed to check used regulators\n"); @@ -593,7 +622,8 @@ static int rtq2208_probe(struct i2c_client *i2c) cfg.dev = dev; /* init regulator desc */ - ret = rtq2208_parse_regulator_dt_data(n_regulator, regulator_idx_table, rdesc, dev); + ret = rtq2208_parse_regulator_dt_data(n_regulator, regulator_idx_table, rdesc, dev, + ldo1_fixed_uV, ldo2_fixed_uV); if (ret) return ret; diff --git a/drivers/reset/reset-microchip-sparx5.c b/drivers/reset/reset-microchip-sparx5.c index aa5464be7053..6d3e75b33260 100644 --- a/drivers/reset/reset-microchip-sparx5.c +++ b/drivers/reset/reset-microchip-sparx5.c @@ -8,6 +8,7 @@ */ #include <linux/mfd/syscon.h> #include <linux/of.h> +#include <linux/of_address.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/property.h> @@ -72,14 +73,22 @@ static struct regmap *mchp_lan966x_syscon_to_regmap(struct device *dev, struct device_node *syscon_np) { struct regmap_config regmap_config = mchp_lan966x_syscon_regmap_config; - resource_size_t size; + struct resource res; void __iomem *base; + int err; + + err = of_address_to_resource(syscon_np, 0, &res); + if (err) + return ERR_PTR(err); - base = devm_of_iomap(dev, syscon_np, 0, &size); - if (IS_ERR(base)) - return ERR_CAST(base); + /* It is not possible to use devm_of_iomap because this resource is + * shared with other drivers. 
+ */ + base = devm_ioremap(dev, res.start, resource_size(&res)); + if (!base) + return ERR_PTR(-ENOMEM); - regmap_config.max_register = size - 4; + regmap_config.max_register = resource_size(&res) - 4; return devm_regmap_init_mmio(dev, base, ®map_config); } diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index e31fa0ad127e..b88cd4fb295b 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -240,8 +240,7 @@ static struct rtc_device *rtc_allocate_device(void) /* Init uie timer */ rtc_timer_init(&rtc->uie_rtctimer, rtc_uie_update_irq, rtc); /* Init pie timer */ - hrtimer_init(&rtc->pie_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rtc->pie_timer.function = rtc_pie_update_irq; + hrtimer_setup(&rtc->pie_timer, rtc_pie_update_irq, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rtc->pie_enabled = 0; set_bit(RTC_FEATURE_ALARM, rtc->features); diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 26e1ea1940ec..62feb2c639d5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -2326,8 +2326,7 @@ static inline int __init ap_async_init(void) */ if (MACHINE_IS_VM) poll_high_timeout = 1500000; - hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ap_poll_timer.function = ap_poll_timeout; + hrtimer_setup(&ap_poll_timer, ap_poll_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); queue_work(system_long_wq, &ap_scan_bus_work); diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 16d085d56e9d..9e42230e42b8 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -2922,9 +2922,7 @@ static long ibmvscsis_alloctimer(struct scsi_info *vscsi) struct timer_cb *p_timer; p_timer = &vscsi->rsp_q_timer; - hrtimer_init(&p_timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - - p_timer->timer.function = ibmvscsis_service_wait_q; + hrtimer_setup(&p_timer->timer, ibmvscsis_service_wait_q, CLOCK_MONOTONIC, HRTIMER_MODE_REL); p_timer->started = false; p_timer->timer_pops = 0; diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index e5a9c5a323f8..62438e84e52a 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1715,18 +1715,12 @@ lpfc_phba_elsring(struct lpfc_hba *phba) * Note: If no valid cpu found, then nr_cpu_ids is returned. * **/ -static inline unsigned int +static __always_inline unsigned int lpfc_next_online_cpu(const struct cpumask *mask, unsigned int start) { - unsigned int cpu_it; - - for_each_cpu_wrap(cpu_it, mask, start) { - if (cpu_online(cpu_it)) - break; - } - - return cpu_it; + return cpumask_next_and_wrap(start, mask, cpu_online_mask); } + /** * lpfc_next_present_cpu - Finds next present CPU after n * @n: the cpu prior to search @@ -1734,16 +1728,9 @@ lpfc_next_online_cpu(const struct cpumask *mask, unsigned int start) * Note: If no next present cpu, then fallback to first present cpu. 
* **/ -static inline unsigned int lpfc_next_present_cpu(int n) +static __always_inline unsigned int lpfc_next_present_cpu(int n) { - unsigned int cpu; - - cpu = cpumask_next(n, cpu_present_mask); - - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_present_mask); - - return cpu; + return cpumask_next_wrap(n, cpu_present_mask); } /** diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index bcadf11414c8..919bf9b7ac26 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -7952,11 +7952,10 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) timer_setup(&phba->fcf.redisc_wait, lpfc_sli4_fcf_redisc_wait_tmo, 0); /* CMF congestion timer */ - hrtimer_init(&phba->cmf_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - phba->cmf_timer.function = lpfc_cmf_timer; + hrtimer_setup(&phba->cmf_timer, lpfc_cmf_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* CMF 1 minute stats collection timer */ - hrtimer_init(&phba->cmf_stats_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - phba->cmf_stats_timer.function = lpfc_cmf_stats_timer; + hrtimer_setup(&phba->cmf_stats_timer, lpfc_cmf_stats_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Control structure for handling external multi-buffer mailbox @@ -12873,7 +12872,7 @@ lpfc_irq_rebalance(struct lpfc_hba *phba, unsigned int cpu, bool offline) if (offline) { /* Find next online CPU on original mask */ - cpu_next = cpumask_next_wrap(cpu, orig_mask, cpu, true); + cpu_next = cpumask_next_wrap(cpu, orig_mask); cpu_select = lpfc_next_online_cpu(orig_mask, cpu_next); /* Found a valid CPU */ diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c index 0ba9e6a6a13c..c8d6ced5640e 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_transport.c +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -105,10 +105,10 @@ struct rep_manu_reply { u8 reserved0[2]; u8 sas_format; u8 reserved2[3]; - u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN]; - u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN]; - u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN]; - u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN]; + u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN] __nonstring; + u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN] __nonstring; + u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN] __nonstring; + u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN] __nonstring; u16 component_id; u8 component_revision_id; u8 reserved3; diff --git a/drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h b/drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h index 587f7d248219..d123d3b740e1 100644 --- a/drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h +++ b/drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h @@ -606,7 +606,7 @@ typedef struct _MPI2_CONFIG_REPLY { typedef struct _MPI2_CONFIG_PAGE_MAN_0 { MPI2_CONFIG_PAGE_HEADER Header; /*0x00 */ - U8 ChipName[16]; /*0x04 */ + U8 ChipName[16] __nonstring; /*0x04 */ U8 ChipRevision[8]; /*0x14 */ U8 BoardName[16]; /*0x1C */ U8 BoardAssembly[16]; /*0x2C */ diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index d84413b77d84..dc74ebc6405a 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -328,10 +328,10 @@ struct rep_manu_reply { u8 reserved0[2]; u8 sas_format; u8 reserved2[3]; - u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN]; - u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN]; - u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN]; - u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN]; + u8 vendor_id[SAS_EXPANDER_VENDOR_ID_LEN] __nonstring; + u8 product_id[SAS_EXPANDER_PRODUCT_ID_LEN] 
__nonstring; + u8 product_rev[SAS_EXPANDER_PRODUCT_REV_LEN] __nonstring; + u8 component_vendor_id[SAS_EXPANDER_COMPONENT_VENDOR_ID_LEN] __nonstring; u16 component_id; u8 component_revision_id; u8 reserved3; diff --git a/drivers/scsi/qla2xxx/qla_mr.h b/drivers/scsi/qla2xxx/qla_mr.h index 4f63aff333db..3a2bd953a976 100644 --- a/drivers/scsi/qla2xxx/qla_mr.h +++ b/drivers/scsi/qla2xxx/qla_mr.h @@ -282,8 +282,8 @@ struct register_host_info { #define QLAFX00_TGT_NODE_LIST_SIZE (sizeof(uint32_t) * 32) struct config_info_data { - uint8_t model_num[16]; - uint8_t model_description[80]; + uint8_t model_num[16] __nonstring; + uint8_t model_description[80] __nonstring; uint8_t reserved0[160]; uint8_t symbolic_name[64]; uint8_t serial_num[32]; diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 5ceaa4665e5d..fe5c30bb2639 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -6384,8 +6384,8 @@ static struct sdebug_queued_cmd *sdebug_alloc_queued_cmd(struct scsi_cmnd *scmd) sd_dp = &sqcp->sd_dp; - hrtimer_init(&sd_dp->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - sd_dp->hrt.function = sdebug_q_cmd_hrt_complete; + hrtimer_setup(&sd_dp->hrt, sdebug_q_cmd_hrt_complete, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED); INIT_WORK(&sd_dp->ew.work, sdebug_q_cmd_wq_complete); sqcp->scmd = scmd; diff --git a/drivers/soc/hisilicon/kunpeng_hccs.c b/drivers/soc/hisilicon/kunpeng_hccs.c index 8aa8dec14911..444a8f59b7da 100644 --- a/drivers/soc/hisilicon/kunpeng_hccs.c +++ b/drivers/soc/hisilicon/kunpeng_hccs.c @@ -1539,8 +1539,8 @@ static ssize_t used_types_show(struct kobject *kobj, u16 i; for (i = 0; i < hdev->used_type_num - 1; i++) - len += sysfs_emit(&buf[len], "%s ", hdev->type_name_maps[i].name); - len += sysfs_emit(&buf[len], "%s\n", hdev->type_name_maps[i].name); + len += sysfs_emit_at(buf, len, "%s ", hdev->type_name_maps[i].name); + len += sysfs_emit_at(buf, len, "%s\n", hdev->type_name_maps[i].name); return len; } diff --git a/drivers/soc/imx/soc-imx8m.c b/drivers/soc/imx/soc-imx8m.c index 8ac7658e3d52..3ed8161d7d28 100644 --- a/drivers/soc/imx/soc-imx8m.c +++ b/drivers/soc/imx/soc-imx8m.c @@ -192,9 +192,20 @@ static __maybe_unused const struct of_device_id imx8_soc_match[] = { devm_kasprintf((dev), GFP_KERNEL, "%d.%d", ((soc_rev) >> 4) & 0xf, (soc_rev) & 0xf) : \ "unknown" +static void imx8m_unregister_soc(void *data) +{ + soc_device_unregister(data); +} + +static void imx8m_unregister_cpufreq(void *data) +{ + platform_device_unregister(data); +} + static int imx8m_soc_probe(struct platform_device *pdev) { struct soc_device_attribute *soc_dev_attr; + struct platform_device *cpufreq_dev; const struct imx8_soc_data *data; struct device *dev = &pdev->dev; const struct of_device_id *id; @@ -239,11 +250,22 @@ static int imx8m_soc_probe(struct platform_device *pdev) if (IS_ERR(soc_dev)) return PTR_ERR(soc_dev); + ret = devm_add_action(dev, imx8m_unregister_soc, soc_dev); + if (ret) + return ret; + pr_info("SoC: %s revision %s\n", soc_dev_attr->soc_id, soc_dev_attr->revision); - if (IS_ENABLED(CONFIG_ARM_IMX_CPUFREQ_DT)) - platform_device_register_simple("imx-cpufreq-dt", -1, NULL, 0); + if (IS_ENABLED(CONFIG_ARM_IMX_CPUFREQ_DT)) { + cpufreq_dev = platform_device_register_simple("imx-cpufreq-dt", -1, NULL, 0); + if (IS_ERR(cpufreq_dev)) + return dev_err_probe(dev, PTR_ERR(cpufreq_dev), + "Failed to register imx-cpufreq-dev device\n"); + ret = devm_add_action(dev, imx8m_unregister_cpufreq, cpufreq_dev); + if (ret) + return ret; + } return 0; } diff --git 
a/drivers/soc/qcom/pdr_interface.c b/drivers/soc/qcom/pdr_interface.c index 328b6153b2be..71be378d2e43 100644 --- a/drivers/soc/qcom/pdr_interface.c +++ b/drivers/soc/qcom/pdr_interface.c @@ -75,7 +75,6 @@ static int pdr_locator_new_server(struct qmi_handle *qmi, { struct pdr_handle *pdr = container_of(qmi, struct pdr_handle, locator_hdl); - struct pdr_service *pds; mutex_lock(&pdr->lock); /* Create a local client port for QMI communication */ @@ -87,12 +86,7 @@ static int pdr_locator_new_server(struct qmi_handle *qmi, mutex_unlock(&pdr->lock); /* Service pending lookup requests */ - mutex_lock(&pdr->list_lock); - list_for_each_entry(pds, &pdr->lookups, node) { - if (pds->need_locator_lookup) - schedule_work(&pdr->locator_work); - } - mutex_unlock(&pdr->list_lock); + schedule_work(&pdr->locator_work); return 0; } diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c index 052c292eeda6..cde19cdfd3c7 100644 --- a/drivers/soc/qcom/pmic_glink.c +++ b/drivers/soc/qcom/pmic_glink.c @@ -233,7 +233,7 @@ static void pmic_glink_pdr_callback(int state, char *svc_path, void *priv) static int pmic_glink_rpmsg_probe(struct rpmsg_device *rpdev) { - struct pmic_glink *pg = __pmic_glink; + struct pmic_glink *pg; guard(mutex)(&__pmic_glink_lock); pg = __pmic_glink; diff --git a/drivers/soc/ti/ti_sci_inta_msi.c b/drivers/soc/ti/ti_sci_inta_msi.c index c36364522157..193266f5e3f9 100644 --- a/drivers/soc/ti/ti_sci_inta_msi.c +++ b/drivers/soc/ti/ti_sci_inta_msi.c @@ -103,19 +103,15 @@ int ti_sci_inta_msi_domain_alloc_irqs(struct device *dev, if (ret) return ret; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); nvec = ti_sci_inta_msi_alloc_descs(dev, res); - if (nvec <= 0) { - ret = nvec; - goto unlock; - } + if (nvec <= 0) + return nvec; /* Use alloc ALL as it's unclear whether there are gaps in the indices */ ret = msi_domain_alloc_irqs_all_locked(dev, MSI_DEFAULT_DOMAIN, nvec); if (ret) dev_err(dev, "Failed to allocate IRQs %d\n", ret); -unlock: - msi_unlock_descs(dev); return ret; } EXPORT_SYMBOL_GPL(ti_sci_inta_msi_domain_alloc_irqs); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index a7a4647717d4..ff07c87dbadc 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2954,9 +2954,13 @@ static ssize_t slave_show(struct device *dev, struct device_attribute *attr, struct spi_controller *ctlr = container_of(dev, struct spi_controller, dev); struct device *child; + int ret; child = device_find_any_child(&ctlr->dev); - return sysfs_emit(buf, "%s\n", child ? to_spi_device(child)->modalias : NULL); + ret = sysfs_emit(buf, "%s\n", child ? 
to_spi_device(child)->modalias : NULL); + put_device(child); + + return ret; } static ssize_t slave_store(struct device *dev, struct device_attribute *attr, diff --git a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h index 049246774ced..6146555fe9cf 100644 --- a/drivers/staging/media/atomisp/include/linux/atomisp_platform.h +++ b/drivers/staging/media/atomisp/include/linux/atomisp_platform.h @@ -172,10 +172,10 @@ void atomisp_unregister_subdev(struct v4l2_subdev *subdev); #define IS_BYT __IS_SOC(INTEL_ATOM_SILVERMONT) #define IS_CHT __IS_SOC(INTEL_ATOM_AIRMONT) #define IS_MRFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID) -#define IS_MOFD __IS_SOC(INTEL_ATOM_AIRMONT_MID) +#define IS_MOFD __IS_SOC(INTEL_ATOM_SILVERMONT_MID2) /* Both CHT and MOFD come with ISP2401 */ #define IS_ISP2401 __IS_SOCS(INTEL_ATOM_AIRMONT, \ - INTEL_ATOM_AIRMONT_MID) + INTEL_ATOM_SILVERMONT_MID2) #endif /* ATOMISP_PLATFORM_H_ */ diff --git a/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h b/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h index 4aa2797f5e3c..8b85524beb59 100644 --- a/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h +++ b/drivers/staging/media/ipu3/include/uapi/intel-ipu3.h @@ -322,7 +322,8 @@ struct ipu3_uapi_ae_config { * 0: positive, 1: negative, default 0. * @y_calc: Pre-processing that converts Bayer quad to RGB+Y values to be * used for building histogram. Range [0, 32], default 8. - * Rule: + * + * Rule: * y_gen_rate_gr + y_gen_rate_r + y_gen_rate_b + y_gen_rate_gb = 32 * A single Y is calculated based on sum of Gr/R/B/Gb based on * their contribution ratio. diff --git a/drivers/thermal/intel/intel_tcc.c b/drivers/thermal/intel/intel_tcc.c index 817421508d5c..b2a615aea7c1 100644 --- a/drivers/thermal/intel/intel_tcc.c +++ b/drivers/thermal/intel/intel_tcc.c @@ -106,7 +106,7 @@ static const struct x86_cpu_id intel_tcc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &temp_broadwell), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &temp_broadwell), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &temp_broadwell), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &temp_goldmont), X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &temp_goldmont), diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c index d0b18358859e..742004d63c6f 100644 --- a/drivers/tty/serial/8250/8250_bcm7271.c +++ b/drivers/tty/serial/8250/8250_bcm7271.c @@ -1056,8 +1056,7 @@ static int brcmuart_probe(struct platform_device *pdev) } /* setup HR timer */ - hrtimer_init(&priv->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - priv->hrt.function = brcmuart_hrtimer_func; + hrtimer_setup(&priv->hrt, brcmuart_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); up.port.shutdown = brcmuart_shutdown; up.port.startup = brcmuart_startup; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 442967a6cd52..c57f44882abb 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -566,12 +566,10 @@ static int serial8250_em485_init(struct uart_8250_port *p) if (!p->em485) return -ENOMEM; - hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; - 
p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; + hrtimer_setup(&p->em485->stop_tx_timer, &serial8250_em485_handle_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&p->em485->start_tx_timer, &serial8250_em485_handle_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); p->em485->port = p; p->em485->active_timer = NULL; p->em485->tx_stopped = true; diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 04212c823a91..98f178bdbcbe 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2867,11 +2867,10 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id) return -EINVAL; } } - - hrtimer_init(&uap->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer_init(&uap->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - uap->trigger_start_tx.function = pl011_trigger_start_tx; - uap->trigger_stop_tx.function = pl011_trigger_stop_tx; + hrtimer_setup(&uap->trigger_start_tx, pl011_trigger_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&uap->trigger_stop_tx, pl011_trigger_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); ret = pl011_setup_port(&dev->dev, uap, &dev->res, portnr); if (ret) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 9c59ec128bb4..9a1afe409b98 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2582,10 +2582,10 @@ static int imx_uart_probe(struct platform_device *pdev) imx_uart_writel(sport, ucr3, UCR3); } - hrtimer_init(&sport->trigger_start_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer_init(&sport->trigger_stop_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sport->trigger_start_tx.function = imx_trigger_start_tx; - sport->trigger_stop_tx.function = imx_trigger_stop_tx; + hrtimer_setup(&sport->trigger_start_tx, imx_trigger_start_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&sport->trigger_stop_tx, imx_trigger_stop_tx, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* * Allocate the IRQ(s) i.MX1 has three interrupts whereas later diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index b1ea48f38248..b72c3bc19bfa 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1702,8 +1702,7 @@ static void sci_request_dma(struct uart_port *port) dma += s->buf_len_rx; } - hrtimer_init(&s->rx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - s->rx_timer.function = sci_dma_rx_timer_fn; + hrtimer_setup(&s->rx_timer, sci_dma_rx_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); s->chan_rx_saved = s->chan_rx = chan; diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index 92ec51870d1d..fe457bf1e15b 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -454,7 +454,7 @@ static void cdns_uart_handle_tx(void *dev_id) if (cdns_uart->port->rs485.flags & SER_RS485_ENABLED && (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))) { - cdns_uart->tx_timer.function = &cdns_rs485_rx_callback; + hrtimer_update_function(&cdns_uart->tx_timer, cdns_rs485_rx_callback); hrtimer_start(&cdns_uart->tx_timer, ns_to_ktime(cdns_calc_after_tx_delay(cdns_uart)), HRTIMER_MODE_REL); } @@ -734,7 +734,7 @@ static void cdns_uart_start_tx(struct uart_port *port) if (cdns_uart->port->rs485.flags & SER_RS485_ENABLED) { if (!cdns_uart->rs485_tx_started) { - cdns_uart->tx_timer.function = &cdns_rs485_tx_callback; + hrtimer_update_function(&cdns_uart->tx_timer, cdns_rs485_tx_callback); cdns_rs485_tx_setup(cdns_uart); return 
hrtimer_start(&cdns_uart->tx_timer, ms_to_ktime(port->rs485.delay_rts_before_send), @@ -1626,8 +1626,8 @@ static int cdns_rs485_config(struct uart_port *port, struct ktermios *termios, writel(val, port->membase + CDNS_UART_MODEMCR); /* Timer setup */ - hrtimer_init(&cdns_uart->tx_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cdns_uart->tx_timer.function = &cdns_rs485_tx_callback; + hrtimer_setup(&cdns_uart->tx_timer, &cdns_rs485_tx_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); /* Disable transmitter and make Rx setup*/ cdns_uart_stop_tx(port); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 23b9f6efa047..2cfa1774944b 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -1782,15 +1782,19 @@ static void ufs_qcom_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg) ufshcd_mcq_config_esi(hba, msg); } +struct ufs_qcom_irq { + unsigned int irq; + unsigned int idx; + struct ufs_hba *hba; +}; + static irqreturn_t ufs_qcom_mcq_esi_handler(int irq, void *data) { - struct msi_desc *desc = data; - struct device *dev = msi_desc_to_dev(desc); - struct ufs_hba *hba = dev_get_drvdata(dev); - u32 id = desc->msi_index; - struct ufs_hw_queue *hwq = &hba->uhq[id]; + struct ufs_qcom_irq *qi = data; + struct ufs_hba *hba = qi->hba; + struct ufs_hw_queue *hwq = &hba->uhq[qi->idx]; - ufshcd_mcq_write_cqis(hba, 0x1, id); + ufshcd_mcq_write_cqis(hba, 0x1, qi->idx); ufshcd_mcq_poll_cqe_lock(hba, hwq); return IRQ_HANDLED; @@ -1799,8 +1803,7 @@ static irqreturn_t ufs_qcom_mcq_esi_handler(int irq, void *data) static int ufs_qcom_config_esi(struct ufs_hba *hba) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); - struct msi_desc *desc; - struct msi_desc *failed_desc = NULL; + struct ufs_qcom_irq *qi; int nr_irqs, ret; if (host->esi_enabled) @@ -1811,47 +1814,47 @@ static int ufs_qcom_config_esi(struct ufs_hba *hba) * 2. Poll queues do not need ESI. 
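The kunpeng_hccs hunk earlier in this series replaces sysfs_emit(&buf[len], ...) with sysfs_emit_at(buf, len, ...): sysfs_emit() insists on being handed the page-aligned buffer the sysfs core passed to ->show(), so calling it with an offset pointer trips its alignment check and emits nothing. A minimal sketch of a multi-item show() callback using the offset-aware helper (the attribute and item names are illustrative, not from the patch):

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

/* Hypothetical read-only attribute printing a space-separated list. */
static ssize_t items_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	static const char * const names[] = { "alpha", "beta", "gamma" };
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(names); i++)
		len += sysfs_emit_at(buf, len, "%s%c", names[i],
				     i == ARRAY_SIZE(names) - 1 ? '\n' : ' ');
	return len;
}
static DEVICE_ATTR_RO(items);

sysfs_emit_at() performs the same PAGE_SIZE bounds checking as sysfs_emit(), but takes the running offset as an argument instead of an offset pointer.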
*/ nr_irqs = hba->nr_hw_queues - hba->nr_queues[HCTX_TYPE_POLL]; + qi = devm_kcalloc(hba->dev, nr_irqs, sizeof(*qi), GFP_KERNEL); + if (qi) + return -ENOMEM; + ret = platform_device_msi_init_and_alloc_irqs(hba->dev, nr_irqs, ufs_qcom_write_msi_msg); if (ret) { dev_err(hba->dev, "Failed to request Platform MSI %d\n", ret); - return ret; + goto cleanup; } - msi_lock_descs(hba->dev); - msi_for_each_desc(desc, hba->dev, MSI_DESC_ALL) { - ret = devm_request_irq(hba->dev, desc->irq, - ufs_qcom_mcq_esi_handler, - IRQF_SHARED, "qcom-mcq-esi", desc); + for (int idx = 0; idx < nr_irqs; idx++) { + qi[idx].irq = msi_get_virq(hba->dev, idx); + qi[idx].idx = idx; + qi[idx].hba = hba; + + ret = devm_request_irq(hba->dev, qi[idx].irq, ufs_qcom_mcq_esi_handler, + IRQF_SHARED, "qcom-mcq-esi", qi + idx); if (ret) { dev_err(hba->dev, "%s: Fail to request IRQ for %d, err = %d\n", - __func__, desc->irq, ret); - failed_desc = desc; - break; + __func__, qi[idx].irq, ret); + qi[idx].irq = 0; + goto cleanup; } } - msi_unlock_descs(hba->dev); - if (ret) { - /* Rewind */ - msi_lock_descs(hba->dev); - msi_for_each_desc(desc, hba->dev, MSI_DESC_ALL) { - if (desc == failed_desc) - break; - devm_free_irq(hba->dev, desc->irq, hba); - } - msi_unlock_descs(hba->dev); - platform_device_msi_free_irqs_all(hba->dev); - } else { - if (host->hw_ver.major == 6 && host->hw_ver.minor == 0 && - host->hw_ver.step == 0) - ufshcd_rmwl(hba, ESI_VEC_MASK, - FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1), - REG_UFS_CFG3); - ufshcd_mcq_enable_esi(hba); - host->esi_enabled = true; + if (host->hw_ver.major == 6 && host->hw_ver.minor == 0 && + host->hw_ver.step == 0) { + ufshcd_rmwl(hba, ESI_VEC_MASK, + FIELD_PREP(ESI_VEC_MASK, MAX_ESI_VEC - 1), + REG_UFS_CFG3); } + ufshcd_mcq_enable_esi(hba); + host->esi_enabled = true; + return 0; +cleanup: + for (int idx = 0; qi[idx].irq; idx++) + devm_free_irq(hba->dev, qi[idx].irq, hba); + platform_device_msi_free_irqs_all(hba->dev); + devm_kfree(hba->dev, qi); return ret; } diff --git a/drivers/usb/chipidea/otg_fsm.c b/drivers/usb/chipidea/otg_fsm.c index c17516c29b63..a093544482d5 100644 --- a/drivers/usb/chipidea/otg_fsm.c +++ b/drivers/usb/chipidea/otg_fsm.c @@ -424,8 +424,7 @@ static enum hrtimer_restart ci_otg_hrtimer_func(struct hrtimer *t) /* Initialize timers */ static int ci_otg_init_timers(struct ci_hdrc *ci) { - hrtimer_init(&ci->otg_fsm_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ci->otg_fsm_hrtimer.function = ci_otg_hrtimer_func; + hrtimer_setup(&ci->otg_fsm_hrtimer, ci_otg_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); return 0; } diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c index 238c6fd50e75..2a542a99ec44 100644 --- a/drivers/usb/dwc2/hcd_queue.c +++ b/drivers/usb/dwc2/hcd_queue.c @@ -1459,8 +1459,7 @@ static void dwc2_qh_init(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh, /* Initialize QH */ qh->hsotg = hsotg; timer_setup(&qh->unreserve_timer, dwc2_unreserve_timer_fn, 0); - hrtimer_init(&qh->wait_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - qh->wait_timer.function = &dwc2_wait_timer_fn; + hrtimer_setup(&qh->wait_timer, &dwc2_wait_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); qh->ep_type = ep_type; qh->ep_is_in = ep_is_in; diff --git a/drivers/usb/fotg210/fotg210-hcd.c b/drivers/usb/fotg210/fotg210-hcd.c index 3d404d19a205..64c4965a160f 100644 --- a/drivers/usb/fotg210/fotg210-hcd.c +++ b/drivers/usb/fotg210/fotg210-hcd.c @@ -4901,8 +4901,7 @@ static int hcd_fotg210_init(struct usb_hcd *hcd) */ fotg210->need_io_watchdog = 1; - hrtimer_init(&fotg210->hrtimer, 
CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - fotg210->hrtimer.function = fotg210_hrtimer_func; + hrtimer_setup(&fotg210->hrtimer, fotg210_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); fotg210->next_hrtimer_event = FOTG210_HRTIMER_NO_EVENT; hcc_params = fotg210_readl(fotg210, &fotg210->caps->hcc_params); diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index f60576a65ca6..58b0dd575af3 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -1559,8 +1559,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) ncm->port.open = ncm_open; ncm->port.close = ncm_close; - hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - ncm->task_timer.function = ncm_tx_timeout; + hrtimer_setup(&ncm->task_timer, ncm_tx_timeout, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); DBG(cdev, "CDC Network: IN/%s OUT/%s NOTIFY/%s\n", ncm->port.in_ep->name, ncm->port.out_ep->name, diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index bda08c5ba7c0..4f1b5db51dda 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -2479,8 +2479,7 @@ static DEVICE_ATTR_RO(urbs); static int dummy_start_ss(struct dummy_hcd *dum_hcd) { - hrtimer_init(&dum_hcd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - dum_hcd->timer.function = dummy_timer; + hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; dum_hcd->stream_en_ep = 0; INIT_LIST_HEAD(&dum_hcd->urbp_list); @@ -2509,8 +2508,7 @@ static int dummy_start(struct usb_hcd *hcd) return dummy_start_ss(dum_hcd); spin_lock_init(&dum_hcd->dum->lock); - hrtimer_init(&dum_hcd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - dum_hcd->timer.function = dummy_timer; + hrtimer_setup(&dum_hcd->timer, dummy_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dum_hcd->rh_state = DUMMY_RH_RUNNING; INIT_LIST_HEAD(&dum_hcd->urbp_list); diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index 6de79ac5e6a4..6d1d190c914d 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -466,8 +466,7 @@ static int ehci_init(struct usb_hcd *hcd) */ ehci->need_io_watchdog = 1; - hrtimer_init(&ehci->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ehci->hrtimer.function = ehci_hrtimer_func; + hrtimer_setup(&ehci->hrtimer, ehci_hrtimer_func, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ehci->next_hrtimer_event = EHCI_HRTIMER_NO_EVENT; hcc_params = ehci_readl(ehci, &ehci->caps->hcc_params); diff --git a/drivers/usb/musb/musb_cppi41.c b/drivers/usb/musb/musb_cppi41.c index 9589243e8951..4cde3abb7006 100644 --- a/drivers/usb/musb/musb_cppi41.c +++ b/drivers/usb/musb/musb_cppi41.c @@ -760,8 +760,8 @@ cppi41_dma_controller_create(struct musb *musb, void __iomem *base) if (!controller) goto kzalloc_fail; - hrtimer_init(&controller->early_tx, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - controller->early_tx.function = cppi41_recheck_tx_req; + hrtimer_setup(&controller->early_tx, cppi41_recheck_tx_req, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); INIT_LIST_HEAD(&controller->early_tx_list); controller->controller.channel_alloc = cppi41_dma_channel_allocate; diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 62ca4a0ec55b..a99db4e025cd 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -7721,14 +7721,14 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) 
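A large share of the hunks here (lpfc, scsi_debug, the 8250/pl011/imx/sh-sci serial drivers, the USB host and gadget code above, and the tcpm hunk just below) perform the same mechanical conversion: the two-step hrtimer_init() followed by an open-coded ->function assignment becomes a single hrtimer_setup() call that takes the callback up front. A minimal before/after sketch with an invented timer and handler name, assuming the hrtimer_setup() signature used throughout these hunks:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;	/* illustrative only */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* periodic work goes here */
	return HRTIMER_NORESTART;
}

static void demo_timer_init(void)
{
	/* Old style: init first, then poke the callback in by hand.
	 *   hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   demo_timer.function = demo_timer_fn;
	 */

	/* New style: one call, callback supplied with the clock and mode. */
	hrtimer_setup(&demo_timer, demo_timer_fn, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL);
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}

The xilinx_uartps hunks additionally swap later reassignment of ->function for hrtimer_update_function(), which keeps the callback change inside the hrtimer API rather than writing the field directly.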
kthread_init_work(&port->event_work, tcpm_pd_event_handler); kthread_init_work(&port->enable_frs, tcpm_enable_frs_work); kthread_init_work(&port->send_discover_work, tcpm_send_discover_work); - hrtimer_init(&port->state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->state_machine_timer.function = state_machine_timer_handler; - hrtimer_init(&port->vdm_state_machine_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->vdm_state_machine_timer.function = vdm_state_machine_timer_handler; - hrtimer_init(&port->enable_frs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->enable_frs_timer.function = enable_frs_timer_handler; - hrtimer_init(&port->send_discover_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - port->send_discover_timer.function = send_discover_timer_handler; + hrtimer_setup(&port->state_machine_timer, state_machine_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&port->vdm_state_machine_timer, vdm_state_machine_timer_handler, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_setup(&port->enable_frs_timer, enable_frs_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_setup(&port->send_discover_timer, send_discover_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); spin_lock_init(&port->pd_event_lock); diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 49559605177e..c321d442f0da 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -266,24 +266,12 @@ static struct file *vfio_device_open_file(struct vfio_device *device) if (ret) goto err_free; - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - df, O_RDWR); + filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops, + df, O_RDWR, FMODE_PREAD | FMODE_PWRITE); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; } - - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. 
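The ufs-qcom ESI rework earlier in this series drops the msi_desc iteration in favour of a driver-owned per-vector cookie array: each vector's Linux IRQ number is looked up by index with msi_get_virq() and requested with its cookie as dev_id, so the handler no longer reaches back into MSI descriptors. A condensed sketch of that shape, with simplified names, assuming the platform MSI vectors were already allocated (e.g. via platform_device_msi_init_and_alloc_irqs() as in the hunk) and with the error unwinding trimmed:

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/msi.h>
#include <linux/slab.h>

struct demo_vec {			/* per-vector cookie, illustrative */
	unsigned int irq;
	unsigned int idx;
	void *ctx;
};

static irqreturn_t demo_vec_handler(int irq, void *data)
{
	struct demo_vec *v = data;

	/* v->idx selects the queue to service for v->ctx */
	return IRQ_HANDLED;
}

static int demo_request_vectors(struct device *dev, void *ctx, int nvec)
{
	struct demo_vec *v;
	int i, ret;

	v = devm_kcalloc(dev, nvec, sizeof(*v), GFP_KERNEL);
	if (!v)
		return -ENOMEM;

	for (i = 0; i < nvec; i++) {
		v[i].irq = msi_get_virq(dev, i);	/* index -> virq */
		v[i].idx = i;
		v[i].ctx = ctx;
		ret = devm_request_irq(dev, v[i].irq, demo_vec_handler,
				       IRQF_SHARED, "demo-vec", &v[i]);
		if (ret)
			return ret;
	}
	return 0;
}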
- */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - /* * Use the pseudo fs inode on the device to link all mmaps * to the same address space, allowing us to unmap all vmas diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 70fbc9a3e703..cf3fb61f4d5b 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -23,6 +23,7 @@ #include <linux/cleanup.h> #include <linux/uuid.h> #include <linux/configfs.h> +#include <linux/mm.h> #include <uapi/linux/sev-guest.h> #include <uapi/linux/psp-sev.h> diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 8a294b9cbcf6..56d0dbe62163 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2950,8 +2950,8 @@ static int virtio_mem_probe(struct virtio_device *vdev) mutex_init(&vm->hotplug_mutex); INIT_LIST_HEAD(&vm->next); spin_lock_init(&vm->removal_lock); - hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - vm->retry_timer.function = virtio_mem_timer_expired; + hrtimer_setup(&vm->retry_timer, virtio_mem_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; vm->in_kdump = is_kdump_kernel(); diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c index 7a1096265f18..0820e35ad2e3 100644 --- a/drivers/watchdog/softdog.c +++ b/drivers/watchdog/softdog.c @@ -187,14 +187,12 @@ static int __init softdog_init(void) watchdog_set_nowayout(&softdog_dev, nowayout); watchdog_stop_on_reboot(&softdog_dev); - hrtimer_init(&softdog_ticktock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - softdog_ticktock.function = softdog_fire; + hrtimer_setup(&softdog_ticktock, softdog_fire, CLOCK_MONOTONIC, HRTIMER_MODE_REL); if (IS_ENABLED(CONFIG_SOFT_WATCHDOG_PRETIMEOUT)) { softdog_info.options |= WDIOF_PRETIMEOUT; - hrtimer_init(&softdog_preticktock, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - softdog_preticktock.function = softdog_pretimeout; + hrtimer_setup(&softdog_preticktock, softdog_pretimeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } if (soft_active_on_boot) diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 19698d87dc57..8369fd94fc1a 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1051,8 +1051,8 @@ static int watchdog_cdev_register(struct watchdog_device *wdd) } kthread_init_work(&wd_data->work, watchdog_ping_work); - hrtimer_init(&wd_data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - wd_data->timer.function = watchdog_timer_expired; + hrtimer_setup(&wd_data->timer, watchdog_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); watchdog_hrtimer_pretimeout_init(wdd); if (wdd->id == 0) { diff --git a/drivers/watchdog/watchdog_hrtimer_pretimeout.c b/drivers/watchdog/watchdog_hrtimer_pretimeout.c index 940b53718a91..fbc7eecd8b20 100644 --- a/drivers/watchdog/watchdog_hrtimer_pretimeout.c +++ b/drivers/watchdog/watchdog_hrtimer_pretimeout.c @@ -23,8 +23,8 @@ void watchdog_hrtimer_pretimeout_init(struct watchdog_device *wdd) { struct watchdog_core_data *wd_data = wdd->wd_data; - hrtimer_init(&wd_data->pretimeout_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wd_data->pretimeout_timer.function = watchdog_hrtimer_pretimeout; + hrtimer_setup(&wd_data->pretimeout_timer, watchdog_hrtimer_pretimeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } void watchdog_hrtimer_pretimeout_start(struct watchdog_device *wdd) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3e68521f4e2f..399d455d50d6 100644 --- 
a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir, * */ -static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int err; u32 perm; @@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (fid) p9_fid_put(fid); - - return err; + return ERR_PTR(err); } /** diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 143ac03b7425..cc2007be2173 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -350,9 +350,9 @@ out: * */ -static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t omode) +static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -417,7 +417,7 @@ error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); - return err; + return ERR_PTR(err); } static int diff --git a/fs/Kconfig b/fs/Kconfig index 64d420e3c475..afe21866d6b4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -336,7 +336,6 @@ source "fs/qnx4/Kconfig" source "fs/qnx6/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" -source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/erofs/Kconfig" source "fs/vboxsf/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 15df0a923d3a..77fd7f7b5d02 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -87,7 +87,6 @@ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-y += unicode/ -obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS) += smb/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS3_FS) += ntfs3/ diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e8c2c4535cb3..ac4e9a02910b 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, +extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 8c154490a2d6..f883be50db12 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir, return 0; } -int +struct dentry * affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inode = affs_new_inode(dir); if (!inode) - return -ENOSPC; + return ERR_PTR(-ENOSPC); inode->i_mode = S_IFDIR | mode; affs_mode_to_prot(inode); @@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, clear_nlink(inode); mark_inode_dirty(inode); iput(inode); - return error; + return ERR_PTR(error); } - return 0; + return NULL; } int diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 6d42f85c6be5..e941da5b6dd9 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -362,3 +362,53 @@ int afs_merge_fs_addr6(struct afs_net *net, struct 
afs_addr_list *alist, alist->nr_addrs++; return 0; } + +/* + * Set the app data on the rxrpc peers an address list points to + */ +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist) +{ + unsigned long data = (unsigned long)server; + int n = 0, o = 0; + + if (!old_alist) { + /* New server. Just set all. */ + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + return; + } + if (!new_alist) { + /* Dead server. Just remove all. */ + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); + return; + } + + /* Walk through the two lists simultaneously, setting new peers and + * clearing old ones. The two lists are ordered by pointer to peer + * record. + */ + while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) { + struct rxrpc_peer *pn = new_alist->addrs[n].peer; + struct rxrpc_peer *po = old_alist->addrs[o].peer; + + if (pn == po) + continue; + if (pn < po) { + rxrpc_kernel_set_peer_data(pn, data); + n++; + } else { + rxrpc_kernel_set_peer_data(po, 0); + o++; + } + } + + if (n < new_alist->nr_addrs) + for (; n < new_alist->nr_addrs; n++) + rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data); + if (o < old_alist->nr_addrs) + for (; o < old_alist->nr_addrs; o++) + rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0); +} diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 96a6781f3653..0168bbf53fe0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60; static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60; static atomic_t cell_debug_id; -static void afs_queue_cell_manager(struct afs_net *); -static void afs_manage_cell_work(struct work_struct *); +static void afs_cell_timer(struct timer_list *timer); +static void afs_destroy_cell_work(struct work_struct *work); +static void afs_manage_cell_work(struct work_struct *work); static void afs_dec_cells_outstanding(struct afs_net *net) { @@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net) wake_up_var(&net->cells_outstanding); } -/* - * Set the cell timer to fire after a given delay, assuming it's not already - * set for an earlier time. 
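afs_set_peer_appdata() above reconciles an old and a new address list that are both ordered by peer pointer, setting app data on peers only present in the new list and clearing it on peers only present in the old one. The core of that is a standard merge-style walk over two sorted sequences; a small self-contained sketch of the idea (plain integers standing in for rxrpc peers), where an entry present in both lists advances both cursors:

/* Merge-walk two ascending arrays: "add" items only in the new list,
 * "remove" items only in the old one, skip items present in both.
 */
static void reconcile(const int *olds, int n_old, const int *news, int n_new,
		      void (*add)(int), void (*remove)(int))
{
	int o = 0, n = 0;

	while (o < n_old && n < n_new) {
		if (olds[o] == news[n]) {
			o++;			/* in both: advance both */
			n++;
		} else if (news[n] < olds[o]) {
			add(news[n++]);		/* only in the new list */
		} else {
			remove(olds[o++]);	/* only in the old list */
		}
	}
	while (n < n_new)
		add(news[n++]);
	while (o < n_old)
		remove(olds[o++]);
}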
- */ -static void afs_set_cell_timer(struct afs_net *net, time64_t delay) +static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state) { - if (net->live) { - atomic_inc(&net->cells_outstanding); - if (timer_reduce(&net->cells_timer, jiffies + delay * HZ)) - afs_dec_cells_outstanding(net); - } else { - afs_queue_cell_manager(net); - } + smp_store_release(&cell->state, state); /* Commit cell changes before state */ + smp_wmb(); /* Set cell state before task state */ + wake_up_var(&cell->state); } /* @@ -116,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, const char *name, unsigned int namelen, const char *addresses) { - struct afs_vlserver_list *vllist; + struct afs_vlserver_list *vllist = NULL; struct afs_cell *cell; int i, ret; @@ -163,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->net = net; refcount_set(&cell->ref, 1); atomic_set(&cell->active, 0); + INIT_WORK(&cell->destroyer, afs_destroy_cell_work); INIT_WORK(&cell->manager, afs_manage_cell_work); + timer_setup(&cell->management_timer, afs_cell_timer, 0); init_rwsem(&cell->vs_lock); cell->volumes = RB_ROOT; INIT_HLIST_HEAD(&cell->proc_volumes); seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; - seqlock_init(&cell->fs_lock); + init_rwsem(&cell->fs_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); @@ -204,7 +199,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->dns_status = vllist->status; smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */ atomic_inc(&net->cells_outstanding); + ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell, + 2, INT_MAX / 2, GFP_KERNEL); + if (ret < 0) + goto error; + cell->dynroot_ino = ret; cell->debug_id = atomic_inc_return(&cell_debug_id); + trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc); _leave(" = %p", cell); @@ -214,6 +215,7 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: + afs_put_vlserverlist(cell->net, vllist); kfree(cell->name - 1); kfree(cell); _leave(" = %d", ret); @@ -227,6 +229,7 @@ error: * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. * @excl: T if an error should be given if the cell name already exists. + * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if * needed. 
Note that that actual DNS query is punted off to the manager thread @@ -235,7 +238,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl) + const char *vllist, bool excl, + enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; struct rb_node *parent, **pp; @@ -245,7 +249,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, _enter("%s,%s", name, vllist); if (!excl) { - cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup); + cell = afs_find_cell(net, name, namesz, trace); if (!IS_ERR(cell)) goto wait_for_cell; } @@ -288,26 +292,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, cell = candidate; candidate = NULL; - atomic_set(&cell->active, 2); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert); + afs_use_cell(cell, trace); rb_link_node_rcu(&cell->net_node, parent, pp); rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_get_queue_new); + afs_queue_cell(cell, afs_cell_trace_queue_new); wait_for_cell: - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active), - afs_cell_trace_wait); _debug("wait_for_cell"); - wait_var_event(&cell->state, - ({ - state = smp_load_acquire(&cell->state); /* vs error */ - state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED; - })); + state = smp_load_acquire(&cell->state); /* vs error */ + if (state != AFS_CELL_ACTIVE && + state != AFS_CELL_DEAD) { + afs_see_cell(cell, afs_cell_trace_wait); + wait_var_event(&cell->state, + ({ + state = smp_load_acquire(&cell->state); /* vs error */ + state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; + })); + } /* Check the state obtained from the wait check. 
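The cell-lookup path above now publishes state with smp_store_release() plus wake_up_var() and waits with wait_var_event(), re-reading the state through smp_load_acquire(). As a general illustration of that pairing on a hypothetical object (not the AFS code itself):

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/wait_bit.h>	/* wait_var_event()/wake_up_var() */

struct demo_obj {
	int state;		/* 0 = setting up, 1 = active, 2 = dead */
};

/* Publisher: make prior initialisation visible before the new state. */
static void demo_set_state(struct demo_obj *obj, int state)
{
	smp_store_release(&obj->state, state);
	wake_up_var(&obj->state);
}

/* Waiter: sleep until the object is either active or dead. */
static int demo_wait_ready(struct demo_obj *obj)
{
	int state;

	wait_var_event(&obj->state,
		       ({
				state = smp_load_acquire(&obj->state);
				state == 1 || state == 2;
			}));
	return state == 1 ? 0 : -ESTALE;
}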
*/ - if (state == AFS_CELL_REMOVED) { + if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } @@ -321,7 +327,7 @@ cell_already_exists: if (excl) { ret = -EEXIST; } else { - afs_use_cell(cursor, afs_cell_trace_use_lookup); + afs_use_cell(cursor, trace); ret = 0; } up_write(&net->cells_lock); @@ -331,7 +337,7 @@ cell_already_exists: goto wait_for_cell; goto error_noput; error: - afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error); error_noput: _leave(" = %d [error]", ret); return ERR_PTR(ret); @@ -376,8 +382,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) if (cp && cp < rootcell + len) return -EINVAL; - /* allocate a cell record for the root cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false); + /* allocate a cell record for the root/workstation cell */ + new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); @@ -388,12 +395,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) /* install the new cell */ down_write(&net->cells_lock); - afs_see_cell(new_root, afs_cell_trace_see_ws); old_root = rcu_replace_pointer(net->ws_cell, new_root, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); - afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws); + afs_unuse_cell(old_root, afs_cell_trace_unuse_ws); _leave(" = 0"); return 0; } @@ -511,8 +517,9 @@ static void afs_cell_destroy(struct rcu_head *rcu) trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free); afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers)); - afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias); + afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias); key_put(cell->anonymous_key); + idr_remove(&net->cells_dyn_ino, cell->dynroot_ino); kfree(cell->name - 1); kfree(cell); @@ -520,30 +527,14 @@ static void afs_cell_destroy(struct rcu_head *rcu) _leave(" [destroyed]"); } -/* - * Queue the cell manager. - */ -static void afs_queue_cell_manager(struct afs_net *net) -{ - int outstanding = atomic_inc_return(&net->cells_outstanding); - - _enter("%d", outstanding); - - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); -} - -/* - * Cell management timer. We have an increment on cells_outstanding that we - * need to pass along to the work item. 
- */ -void afs_cells_timer(struct timer_list *timer) +static void afs_destroy_cell_work(struct work_struct *work) { - struct afs_net *net = container_of(timer, struct afs_net, cells_timer); + struct afs_cell *cell = container_of(work, struct afs_cell, destroyer); - _enter(""); - if (!queue_work(afs_wq, &net->cells_manager)) - afs_dec_cells_outstanding(net); + afs_see_cell(cell, afs_cell_trace_destroy); + timer_delete_sync(&cell->management_timer); + cancel_work_sync(&cell->manager); + call_rcu(&cell->rcu, afs_cell_destroy); } /* @@ -575,7 +566,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason) if (zero) { a = atomic_read(&cell->active); WARN(a != 0, "Cell active count %u > 0\n", a); - call_rcu(&cell->rcu, afs_cell_destroy); + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } } } @@ -587,10 +578,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) { int r, a; - r = refcount_read(&cell->ref); - WARN_ON(r == 0); + __refcount_inc(&cell->ref, &r); a = atomic_inc_return(&cell->active); - trace_afs_cell(cell->debug_id, r, a, reason); + trace_afs_cell(cell->debug_id, r + 1, a, reason); return cell; } @@ -598,10 +588,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) * Record a cell becoming less active. When the active counter reaches 1, it * is scheduled for destruction, but may get reactivated. */ -void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason) { unsigned int debug_id; time64_t now, expire_delay; + bool zero; int r, a; if (!cell) @@ -616,13 +607,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr expire_delay = afs_cell_gc_delay; debug_id = cell->debug_id; - r = refcount_read(&cell->ref); a = atomic_dec_return(&cell->active); - trace_afs_cell(debug_id, r, a, reason); - WARN_ON(a == 0); - if (a == 1) + if (!a) /* 'cell' may now be garbage collected. */ - afs_set_cell_timer(net, expire_delay); + afs_set_cell_timer(cell, expire_delay); + + zero = __refcount_dec_and_test(&cell->ref, &r); + trace_afs_cell(debug_id, r - 1, a, reason); + if (zero) + WARN_ON(!queue_work(afs_wq, &cell->destroyer)); } /* @@ -642,9 +635,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason) { - afs_get_cell(cell, reason); - if (!queue_work(afs_wq, &cell->manager)) - afs_put_cell(cell, afs_cell_trace_put_queue_fail); + queue_work(afs_wq, &cell->manager); +} + +/* + * Cell-specific management timer. + */ +static void afs_cell_timer(struct timer_list *timer) +{ + struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer); + + afs_see_cell(cell, afs_cell_trace_see_mgmt_timer); + if (refcount_read(&cell->ref) > 0 && cell->net->live) + queue_work(afs_wq, &cell->manager); +} + +/* + * Set/reduce the cell timer. 
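With the global cell manager gone, each afs_cell carries its own management timer: the per-cell timer callback above just requeues the cell's manager work, and afs_set_cell_timer() (defined just below) uses timer_reduce() so repeated calls only ever pull the expiry earlier. A generic sketch of that per-object timer/work pattern with invented names:

#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

struct demo_obj {
	struct timer_list timer;
	struct work_struct manager;
};

static void demo_timer_fn(struct timer_list *t)
{
	struct demo_obj *obj = container_of(t, struct demo_obj, timer);

	schedule_work(&obj->manager);	/* defer real work to process context */
}

static void demo_manager_work(struct work_struct *work)
{
	/* periodic management for the object goes here */
}

static void demo_obj_init(struct demo_obj *obj)
{
	timer_setup(&obj->timer, demo_timer_fn, 0);
	INIT_WORK(&obj->manager, demo_manager_work);
}

/* Arm for delay_secs from now; never pushes an already-armed timer later. */
static void demo_set_timer(struct demo_obj *obj, unsigned int delay_secs)
{
	timer_reduce(&obj->timer, jiffies + delay_secs * HZ);
}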
+ */ +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) +{ + timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ); } /* @@ -706,7 +717,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) if (cell->proc_link.next) cell->proc_link.next->pprev = &cell->proc_link.next; - afs_dynroot_mkdir(net, cell); mutex_unlock(&net->proc_cells_lock); return 0; } @@ -723,217 +733,130 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) mutex_lock(&net->proc_cells_lock); if (!hlist_unhashed(&cell->proc_link)) hlist_del_rcu(&cell->proc_link); - afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); _leave(""); } +static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage) +{ + const struct afs_vlserver_list *vllist; + time64_t expire_at = cell->last_inactive; + time64_t now = ktime_get_real_seconds(); + + if (atomic_read(&cell->active)) + return false; + if (!cell->net->live) + return true; + + vllist = rcu_dereference_protected(cell->vl_servers, true); + if (vllist && vllist->nr_servers > 0) + expire_at += afs_cell_gc_delay; + + if (expire_at <= now) + return true; + if (expire_at < *_next_manage) + *_next_manage = expire_at; + return false; +} + /* * Manage a cell record, initialising and destroying it, maintaining its DNS * records. */ -static void afs_manage_cell(struct afs_cell *cell) +static bool afs_manage_cell(struct afs_cell *cell) { struct afs_net *net = cell->net; - int ret, active; + time64_t next_manage = TIME64_MAX; + int ret; _enter("%s", cell->name); -again: _debug("state %u", cell->state); switch (cell->state) { - case AFS_CELL_INACTIVE: - case AFS_CELL_FAILED: - down_write(&net->cells_lock); - active = 1; - if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) { - rb_erase(&cell->net_node, &net->cells); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0, - afs_cell_trace_unuse_delete); - smp_store_release(&cell->state, AFS_CELL_REMOVED); - } - up_write(&net->cells_lock); - if (cell->state == AFS_CELL_REMOVED) { - wake_up_var(&cell->state); - goto final_destruction; - } - if (cell->state == AFS_CELL_FAILED) - goto done; - smp_store_release(&cell->state, AFS_CELL_UNSET); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_UNSET: - smp_store_release(&cell->state, AFS_CELL_ACTIVATING); - wake_up_var(&cell->state); - goto again; - - case AFS_CELL_ACTIVATING: - ret = afs_activate_cell(net, cell); - if (ret < 0) - goto activation_failed; + case AFS_CELL_SETTING_UP: + goto set_up_cell; + case AFS_CELL_ACTIVE: + goto cell_is_active; + case AFS_CELL_REMOVING: + WARN_ON_ONCE(1); + return false; + case AFS_CELL_DEAD: + return false; + default: + _debug("bad state %u", cell->state); + WARN_ON_ONCE(1); /* Unhandled state */ + return false; + } - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - goto again; +set_up_cell: + ret = afs_activate_cell(net, cell); + if (ret < 0) { + cell->error = ret; + goto remove_cell; + } - case AFS_CELL_ACTIVE: - if (atomic_read(&cell->active) > 1) { - if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { - ret = afs_update_cell(cell); - if (ret < 0) - cell->error = ret; - } - goto done; - } - smp_store_release(&cell->state, AFS_CELL_DEACTIVATING); - wake_up_var(&cell->state); - goto again; + afs_set_cell_state(cell, AFS_CELL_ACTIVE); - case AFS_CELL_DEACTIVATING: - if (atomic_read(&cell->active) > 1) - goto reverse_deactivation; - afs_deactivate_cell(net, cell); - smp_store_release(&cell->state, 
AFS_CELL_INACTIVE); - wake_up_var(&cell->state); - goto again; +cell_is_active: + if (afs_has_cell_expired(cell, &next_manage)) + goto remove_cell; - case AFS_CELL_REMOVED: - goto done; + if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) { + ret = afs_update_cell(cell); + if (ret < 0) + cell->error = ret; + } - default: - break; + if (next_manage < TIME64_MAX && cell->net->live) { + time64_t now = ktime_get_real_seconds(); + + if (next_manage - now <= 0) + afs_queue_cell(cell, afs_cell_trace_queue_again); + else + afs_set_cell_timer(cell, next_manage - now); } - _debug("bad state %u", cell->state); - BUG(); /* Unhandled state */ + _leave(" [done %u]", cell->state); + return false; -activation_failed: - cell->error = ret; - afs_deactivate_cell(net, cell); +remove_cell: + down_write(&net->cells_lock); - smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */ - wake_up_var(&cell->state); - goto again; + if (atomic_read(&cell->active)) { + up_write(&net->cells_lock); + goto cell_is_active; + } -reverse_deactivation: - smp_store_release(&cell->state, AFS_CELL_ACTIVE); - wake_up_var(&cell->state); - _leave(" [deact->act]"); - return; + /* Make sure that the expiring server records are going to see the fact + * that the cell is caput. + */ + afs_set_cell_state(cell, AFS_CELL_REMOVING); -done: - _leave(" [done %u]", cell->state); - return; + afs_deactivate_cell(net, cell); + afs_purge_servers(cell); + + rb_erase(&cell->net_node, &net->cells); + afs_see_cell(cell, afs_cell_trace_unuse_delete); + up_write(&net->cells_lock); -final_destruction: /* The root volume is pinning the cell */ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root); cell->root_volume = NULL; - afs_put_cell(cell, afs_cell_trace_put_destroy); + + afs_set_cell_state(cell, AFS_CELL_DEAD); + return true; } static void afs_manage_cell_work(struct work_struct *work) { struct afs_cell *cell = container_of(work, struct afs_cell, manager); + bool final_put; - afs_manage_cell(cell); - afs_put_cell(cell, afs_cell_trace_put_queue_work); -} - -/* - * Manage the records of cells known to a network namespace. This includes - * updating the DNS records and garbage collecting unused cells that were - * automatically added. - * - * Note that constructed cell records may only be removed from net->cells by - * this work item, so it is safe for this work item to stash a cursor pointing - * into the tree and then return to caller (provided it skips cells that are - * still under construction). - * - * Note also that we were given an increment on net->cells_outstanding by - * whoever queued us that we need to deal with before returning. - */ -void afs_manage_cells(struct work_struct *work) -{ - struct afs_net *net = container_of(work, struct afs_net, cells_manager); - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; - - _enter(""); - - /* Trawl the cell database looking for cells that have expired from - * lack of use and cells whose DNS results have expired and dispatch - * their managers. 
- */ - down_read(&net->cells_lock); - - for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { - struct afs_cell *cell = - rb_entry(cursor, struct afs_cell, net_node); - unsigned active; - bool sched_cell = false; - - active = atomic_read(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_manage); - - ASSERTCMP(active, >=, 1); - - if (purging) { - if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) { - active = atomic_dec_return(&cell->active); - trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), - active, afs_cell_trace_unuse_pin); - } - } - - if (active == 1) { - struct afs_vlserver_list *vllist; - time64_t expire_at = cell->last_inactive; - - read_lock(&cell->vl_servers_lock); - vllist = rcu_dereference_protected( - cell->vl_servers, - lockdep_is_held(&cell->vl_servers_lock)); - if (vllist->nr_servers > 0) - expire_at += afs_cell_gc_delay; - read_unlock(&cell->vl_servers_lock); - if (purging || expire_at <= now) - sched_cell = true; - else if (expire_at < next_manage) - next_manage = expire_at; - } - - if (!purging) { - if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) - sched_cell = true; - } - - if (sched_cell) - afs_queue_cell(cell, afs_cell_trace_get_queue_manage); - } - - up_read(&net->cells_lock); - - /* Update the timer on the way out. We have to pass an increment on - * cells_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); - - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->cells_manager)) - atomic_inc(&net->cells_outstanding); - } else { - afs_set_cell_timer(net, next_manage - now); - } - } - - afs_dec_cells_outstanding(net); - _leave(" [%d]", atomic_read(&net->cells_outstanding)); + afs_see_cell(cell, afs_cell_trace_manage); + final_put = afs_manage_cell(cell); + afs_see_cell(cell, afs_cell_trace_managed); + if (final_put) + afs_put_cell(cell, afs_cell_trace_put_final); } /* @@ -942,6 +865,7 @@ void afs_manage_cells(struct work_struct *work) void afs_cell_purge(struct afs_net *net) { struct afs_cell *ws; + struct rb_node *cursor; _enter(""); @@ -949,14 +873,21 @@ void afs_cell_purge(struct afs_net *net) ws = rcu_replace_pointer(net->ws_cell, NULL, lockdep_is_held(&net->cells_lock)); up_write(&net->cells_lock); - afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws); + afs_unuse_cell(ws, afs_cell_trace_unuse_ws); - _debug("del timer"); - if (del_timer_sync(&net->cells_timer)) - atomic_dec(&net->cells_outstanding); + _debug("kick cells"); + down_read(&net->cells_lock); + for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) { + struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node); + + afs_see_cell(cell, afs_cell_trace_purge); - _debug("kick mgr"); - afs_queue_cell_manager(net); + if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) + afs_unuse_cell(cell, afs_cell_trace_unuse_pin); + + afs_queue_cell(cell, afs_cell_trace_queue_purge); + } + up_read(&net->cells_lock); _debug("wait"); wait_var_event(&net->cells_outstanding, diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 99a3f20bc786..1a906805a9e3 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call) } /* - * Find the server record by peer address and record a probe to the cache - * manager from a server. 
- */ -static int afs_find_cm_server_by_peer(struct afs_call *call) -{ - struct sockaddr_rxrpc srx; - struct afs_server *server; - struct rxrpc_peer *peer; - - peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall); - - server = afs_find_server(call->net, peer); - if (!server) { - trace_afs_cm_no_server(call, &srx); - return 0; - } - - call->server = server; - return 0; -} - -/* - * Find the server record by server UUID and record a probe to the cache - * manager from a server. - */ -static int afs_find_cm_server_by_uuid(struct afs_call *call, - struct afs_uuid *uuid) -{ - struct afs_server *server; - - rcu_read_lock(); - server = afs_find_server_by_uuid(call->net, call->request); - rcu_read_unlock(); - if (!server) { - trace_afs_cm_no_server_u(call, call->request); - return 0; - } - - call->server = server; - return 0; -} - -/* * Clean up a cache manager call. */ static void afs_cm_destructor(struct afs_call *call) @@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) */ static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { - int ret; - _enter(""); afs_extract_discard(call, 0); - ret = afs_extract_data(call, false); - if (ret < 0) - return ret; - - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_peer(call); + return afs_extract_data(call, false); } /* @@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) __be32 *b; int ret; - _enter(""); - _enter("{%u}", call->unmarshall); switch (call->unmarshall) { @@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - /* we'll need the file server record as that tells us which set of - * vnodes to operate upon */ - return afs_find_cm_server_by_uuid(call, call->request); + if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) { + pr_notice("Callback UUID does not match fileserver UUID\n"); + trace_afs_cm_no_server_u(call, call->request); + return 0; + } + + return 0; } /* @@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - return afs_find_cm_server_by_peer(call); + return 0; } /* @@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING)) return afs_io_error(call, afs_io_error_cm_reply); - - /* We'll need the file server record as that tells us which set of - * vnodes to 
operate upon. - */ - return afs_find_cm_server_by_peer(call); + return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 02cbf38e1a77..9e7b1fe82c27 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); -static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode); +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, @@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, afs_stat_v(dvnode, n_lookup); inode = afs_do_lookup(dir, dentry); if (inode == ERR_PTR(-ENOENT)) - inode = afs_try_auto_mntpt(dentry, dir); - - if (!IS_ERR_OR_NULL(inode)) + inode = NULL; + else if (!IS_ERR_OR_NULL(inode)) fid = AFS_FS_I(inode)->fid; _debug("splice %p", dentry->d_inode); @@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op = afs_alloc_operation(NULL, dvnode->volume); if (IS_ERR(op)) { d_drop(dentry); - return PTR_ERR(op); + return ERR_CAST(op); } fscache_use_cookie(afs_vnode_cache(dvnode), true); @@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op->ops = &afs_mkdir_operation; ret = afs_do_sync_operation(op); afs_dir_unuse_cookie(dvnode, ret); - return ret; + return ERR_PTR(ret); } /* diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 008698d706ca..691e0ae607a1 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,16 +10,19 @@ #include <linux/dns_resolver.h> #include "internal.h" -static atomic_t afs_autocell_ino; +#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */ +#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX) + +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino); /* * iget5() comparator for inode created by autocell operations - * - * These pseudo inodes don't match anything. */ static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) { - return 0; + struct afs_fid *fid = opaque; + + return inode->i_ino == fid->vnode; } /* @@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) } /* - * Create an inode for a dynamic root directory or an autocell dynamic - * automount dir. + * Create an inode for an autocell dynamic automount dir. 
*/ -struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) +static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) { - struct afs_super_info *as = AFS_FS_S(sb); struct afs_vnode *vnode; struct inode *inode; - struct afs_fid fid = {}; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; _enter(""); - if (as->volume) - fid.vid = as->volume->vid; - if (root) { - fid.vnode = 1; - fid.unique = 1; - } else { - fid.vnode = atomic_inc_return(&afs_autocell_ino); - fid.unique = 0; - } - inode = iget5_locked(sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) { @@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - BUG_ON(!(inode->i_state & I_NEW)); - - netfs_inode_init(&vnode->netfs, NULL, false); - inode->i_size = 0; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (root) { - inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &simple_dir_operations; - } else { - inode->i_op = &afs_autocell_inode_operations; - } - set_nlink(inode, 2); - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - simple_inode_init_ts(inode); - inode->i_blocks = 0; - inode->i_generation = 0; - - set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); - if (!root) { + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_autocell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_AUTOMOUNT | S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - inode->i_flags |= S_AUTOMOUNT; - } - inode->i_flags |= S_NOATIME; - unlock_new_inode(inode); + unlock_new_inode(inode); + } _leave(" = %p", inode); return inode; } /* - * Probe to see if a cell may exist. This prevents positive dentries from - * being created unnecessarily. + * Try to automount the mountpoint with pseudo directory, if the autocell + * option is set. */ -static int afs_probe_cell_name(struct dentry *dentry) +static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct afs_cell *cell; + struct afs_cell *cell = NULL; struct afs_net *net = afs_d2net(dentry); + struct inode *inode = NULL; const char *name = dentry->d_name.name; size_t len = dentry->d_name.len; - char *result = NULL; - int ret; + bool dotted = false; + int ret = -ENOENT; /* Names prefixed with a dot are R/W mounts. 
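afs_iget_pseudo_dir() above switches the dynroot's pseudo inodes to a real iget5_locked() lookup keyed on the inode number, so a cell's automount directory is found again instead of being recreated each time. The general shape of an iget5_locked() user, with a hypothetical key type, looks like this:

#include <linux/err.h>
#include <linux/fs.h>

struct demo_key {
	unsigned long ino;
};

/* Does an existing inode match the lookup key? */
static int demo_test(struct inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	return inode->i_ino == key->ino;
}

/* Initialise a freshly allocated inode from the key. */
static int demo_set(struct inode *inode, void *opaque)
{
	struct demo_key *key = opaque;

	inode->i_ino = key->ino;
	return 0;
}

static struct inode *demo_get_inode(struct super_block *sb, unsigned long ino)
{
	struct demo_key key = { .ino = ino };
	struct inode *inode;

	inode = iget5_locked(sb, ino, demo_test, demo_set, &key);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		/* First instantiation: fill in mode, ops, timestamps, ... */
		inode->i_mode = S_IFDIR | 0555;
		set_nlink(inode, 2);
		unlock_new_inode(inode);
	}
	return inode;
}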
*/ if (name[0] == '.') { - if (len == 1) - return -EINVAL; name++; len--; + dotted = true; } - cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe); - if (!IS_ERR(cell)) { - afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe); - return 0; + cell = afs_lookup_cell(net, name, len, NULL, false, + afs_cell_trace_use_lookup_dynroot); + if (IS_ERR(cell)) { + ret = PTR_ERR(cell); + goto out_no_cell; } - ret = dns_query(net->net, "afsdb", name, len, "srv=1", - &result, NULL, false); - if (ret == -ENODATA || ret == -ENOKEY || ret == 0) - ret = -ENOENT; - if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) { - struct dns_server_list_v1_header *v1 = (void *)result; - - if (v1->hdr.zero == 0 && - v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST && - v1->hdr.version == 1 && - (v1->status != DNS_LOOKUP_GOOD && - v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) - return -ENOENT; - - } - - kfree(result); - return ret; -} - -/* - * Try to auto mount the mountpoint with pseudo directory, if the autocell - * operation is setted. - */ -struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir) -{ - struct afs_vnode *vnode = AFS_FS_I(dir); - struct inode *inode; - int ret = -ENOENT; - - _enter("%p{%pd}, {%llx:%llu}", - dentry, dentry, vnode->fid.vid, vnode->fid.vnode); - - if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) - goto out; - - ret = afs_probe_cell_name(dentry); - if (ret < 0) - goto out; - - inode = afs_iget_pseudo_dir(dir->i_sb, false); + inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out; } - _leave("= %p", inode); - return inode; + dentry->d_fsdata = cell; + return d_splice_alias(inode, dentry); out: - _leave("= %d", ret); + afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot); +out_no_cell: + if (!inode) + return d_splice_alias(inode, dentry); return ret == -ENOENT ? NULL : ERR_PTR(ret); } @@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr { _enter("%pd", dentry); - ASSERTCMP(d_inode(dentry), ==, NULL); - if (flags & LOOKUP_CREATE) return ERR_PTR(-EOPNOTSUPP); @@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr return ERR_PTR(-ENAMETOOLONG); } - return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry); + if (dentry->d_name.len == 5 && + memcmp(dentry->d_name.name, "@cell", 5) == 0) + return afs_lookup_atcell(dir, dentry, 2); + + if (dentry->d_name.len == 6 && + memcmp(dentry->d_name.name, ".@cell", 6) == 0) + return afs_lookup_atcell(dir, dentry, 3); + + return afs_dynroot_lookup_cell(dir, dentry, flags); } const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; -const struct dentry_operations afs_dynroot_dentry_operations = { - .d_delete = always_delete_dentry, - .d_release = afs_d_release, - .d_automount = afs_d_automount, -}; - -/* - * Create a manually added cell mount directory. 
- * - The caller must hold net->proc_cells_lock - */ -int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell) -{ - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *subdir, *dsubdir; - char *dotname = cell->name - 1; - int ret; - - if (!sb || atomic_read(&sb->s_active) == 0) - return 0; - - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - subdir = lookup_one_len(cell->name, root, cell->name_len); - if (IS_ERR(subdir)) { - ret = PTR_ERR(subdir); - goto unlock; - } - - dsubdir = lookup_one_len(dotname, root, cell->name_len + 1); - if (IS_ERR(dsubdir)) { - ret = PTR_ERR(dsubdir); - dput(subdir); - goto unlock; - } - - /* Note that we're retaining extra refs on the dentries. */ - subdir->d_fsdata = (void *)1UL; - dsubdir->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; -} - -static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len) +static void afs_dynroot_d_release(struct dentry *dentry) { - struct dentry *subdir; - - /* Don't want to trigger a lookup call, which will re-add the cell */ - subdir = try_lookup_one_len(name, root, name_len); - if (IS_ERR_OR_NULL(subdir)) { - _debug("lookup %ld", PTR_ERR(subdir)); - return; - } - - _debug("rmdir %pd %u", subdir, d_count(subdir)); + struct afs_cell *cell = dentry->d_fsdata; - if (subdir->d_fsdata) { - _debug("unpin %u", d_count(subdir)); - subdir->d_fsdata = NULL; - dput(subdir); - } - dput(subdir); + afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt); } /* - * Remove a manually added cell mount directory. - * - The caller must hold net->proc_cells_lock + * Keep @cell symlink dentries around, but only keep cell autodirs when they're + * being used. */ -void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +static int afs_dynroot_delete_dentry(const struct dentry *dentry) { - struct super_block *sb = net->dynroot_sb; - char *dotname = cell->name - 1; - - if (!sb || atomic_read(&sb->s_active) == 0) - return; + const struct qstr *name = &dentry->d_name; - inode_lock(sb->s_root->d_inode); - afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len); - afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1); - inode_unlock(sb->s_root->d_inode); - _leave(""); + if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) + return 0; + if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) + return 0; + return 1; } +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_delete = afs_dynroot_delete_dentry, + .d_release = afs_dynroot_d_release, + .d_automount = afs_d_automount, +}; + static void afs_atcell_delayed_put_cell(void *arg) { struct afs_cell *cell = arg; @@ -314,6 +210,9 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod const char *name; bool dotted = vnode->fid.vnode == 3; + if (!rcu_access_pointer(net->ws_cell)) + return ERR_PTR(-ENOENT); + if (!dentry) { /* We're in RCU-pathwalk. */ cell = rcu_dereference(net->ws_cell); @@ -325,9 +224,6 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod return name; } - if (!rcu_access_pointer(net->ws_cell)) - return ERR_PTR(-ENOENT); - down_read(&net->cells_lock); cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); @@ -347,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = { }; /* - * Look up @cell or .@cell in a dynroot directory. 
This is a substitution for - * the local cell name for the net namespace. + * Create an inode for the @cell or .@cell symlinks. */ -static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name) +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) { struct afs_vnode *vnode; - struct afs_fid fid = { .vnode = 2, .unique = 1, }; - struct dentry *dentry; struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; - if (name[0] == '.') - fid.vnode = 3; - - dentry = d_alloc_name(root, name); - if (!dentry) - return ERR_PTR(-ENOMEM); - - inode = iget5_locked(dentry->d_sb, fid.vnode, + inode = iget5_locked(dir->i_sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); - if (!inode) { - dput(dentry); + if (!inode) return ERR_PTR(-ENOMEM); - } vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) { - iput(inode); - dput(dentry); - return ERR_PTR(-EIO); + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); } - - netfs_inode_init(&vnode->netfs, NULL, false); - simple_inode_init_ts(inode); - set_nlink(inode, 1); - inode->i_size = 0; - inode->i_mode = S_IFLNK | 0555; - inode->i_op = &afs_atcell_inode_operations; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_blocks = 0; - inode->i_generation = 0; - inode->i_flags |= S_NOATIME; - - unlock_new_inode(inode); - d_splice_alias(inode, dentry); - return dentry; + return d_splice_alias(inode, dentry); } /* - * Create @cell and .@cell symlinks. + * Transcribe the cell database into readdir content under the RCU read lock. + * Each cell produces two entries, one prefixed with a dot and one not. */ -static int afs_dynroot_symlink(struct afs_net *net) +static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *symlink, *dsymlink; - int ret; - - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - symlink = afs_dynroot_create_symlink(root, "@cell"); - if (IS_ERR(symlink)) { - ret = PTR_ERR(symlink); - goto unlock; - } + const struct afs_cell *cell; + loff_t newpos; + + _enter("%llu", ctx->pos); + + for (;;) { + unsigned int ix = ctx->pos >> 1; + + cell = idr_get_next(&net->cells_dyn_ino, &ix); + if (!cell) + return 0; + if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || + READ_ONCE(cell->state) == AFS_CELL_DEAD) { + ctx->pos += 2; + ctx->pos &= ~1; + continue; + } - dsymlink = afs_dynroot_create_symlink(root, ".@cell"); - if (IS_ERR(dsymlink)) { - ret = PTR_ERR(dsymlink); - dput(symlink); - goto unlock; - } + newpos = ix << 1; + if (newpos > ctx->pos) + ctx->pos = newpos; - /* Note that we're retaining extra refs on the dentries. 
*/ - symlink->d_fsdata = (void *)1UL; - dsymlink->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; + _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); + + if ((ctx->pos & 1) == 0) { + if (!dir_emit(ctx, cell->name, cell->name_len, + cell->dynroot_ino, DT_DIR)) + return 0; + ctx->pos++; + } + if ((ctx->pos & 1) == 1) { + if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, + cell->dynroot_ino + 1, DT_DIR)) + return 0; + ctx->pos++; + } + } + return 0; } /* - * Populate a newly created dynamic root with cell names. + * Read the AFS dynamic root directory. This produces a list of cellnames, + * dotted and undotted, along with @cell and .@cell links if configured. */ -int afs_dynroot_populate(struct super_block *sb) +static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) { - struct afs_cell *cell; - struct afs_net *net = afs_sb2net(sb); - int ret; + struct afs_net *net = afs_d2net(file->f_path.dentry); + int ret = 0; - mutex_lock(&net->proc_cells_lock); - - net->dynroot_sb = sb; - ret = afs_dynroot_symlink(net); - if (ret < 0) - goto error; + if (!dir_emit_dots(file, ctx)) + return 0; - hlist_for_each_entry(cell, &net->proc_cells, proc_link) { - ret = afs_dynroot_mkdir(net, cell); - if (ret < 0) - goto error; + if (ctx->pos == 2) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) + return 0; + ctx->pos = 3; + } + if (ctx->pos == 3) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) + return 0; + ctx->pos = 4; } - ret = 0; -out: - mutex_unlock(&net->proc_cells_lock); + if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { + rcu_read_lock(); + ret = afs_dynroot_readdir_cells(net, ctx); + rcu_read_unlock(); + } return ret; - -error: - net->dynroot_sb = NULL; - goto out; } +static const struct file_operations afs_dynroot_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = afs_dynroot_readdir, + .fsync = noop_fsync, +}; + /* - * When a dynamic root that's in the process of being destroyed, depopulate it - * of pinned directories. + * Create an inode for a dynamic root directory. 
*/ -void afs_dynroot_depopulate(struct super_block *sb) +struct inode *afs_dynroot_iget_root(struct super_block *sb) { - struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir; - - /* Prevent more subdirs from being created */ - mutex_lock(&net->proc_cells_lock); - if (net->dynroot_sb == sb) - net->dynroot_sb = NULL; - mutex_unlock(&net->proc_cells_lock); - - if (root) { - struct hlist_node *n; - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); - } - } + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; + + if (as->volume) + fid.vid = as->volume->vid; + + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); - inode_unlock(root->d_inode); + /* there shouldn't be an existing inode */ + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &afs_dynroot_file_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + unlock_new_inode(inode); } + _leave(" = %p", inode); + return inode; } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index b516d05b0fef..07a8bfbdd9b9 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -235,20 +235,20 @@ out: * Probe all of a fileserver's addresses to find out the best route and to * query its capabilities. 
*/ -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_alist, struct key *key) +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key) { struct afs_endpoint_state *estate, *old; - struct afs_addr_list *alist; + struct afs_addr_list *old_alist = NULL, *alist; unsigned long unprobed; _enter("%pU", &server->uuid); estate = kzalloc(sizeof(*estate), GFP_KERNEL); if (!estate) - return; + return -ENOMEM; - refcount_set(&estate->ref, 1); + refcount_set(&estate->ref, 2); estate->server_id = server->debug_id; estate->rtt = UINT_MAX; @@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, old = rcu_dereference_protected(server->endpoint_state, lockdep_is_held(&server->fs_lock)); - estate->responsive_set = old->responsive_set; - estate->addresses = afs_get_addrlist(new_alist ?: old->addresses, - afs_alist_trace_get_estate); + if (old) { + estate->responsive_set = old->responsive_set; + if (!new_alist) + new_alist = old->addresses; + } + + if (old_alist != new_alist) + afs_set_peer_appdata(server, old_alist, new_alist); + + estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate); alist = estate->addresses; estate->probe_seq = ++server->probe_counter; atomic_set(&estate->nr_probing, alist->nr_addrs); + if (new_alist) + server->addr_version = new_alist->version; rcu_assign_pointer(server->endpoint_state, estate); - set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); write_unlock(&server->fs_lock); + if (old) + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), afs_estate_trace_alloc_probe); - afs_get_address_preferences(net, alist); + afs_get_address_preferences(net, new_alist); server->probed_at = jiffies; unprobed = (1UL << alist->nr_addrs) - 1; @@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, } afs_put_endpoint_state(old, afs_estate_trace_put_probe); + afs_put_endpoint_state(estate, afs_estate_trace_put_probe); + return 0; } /* diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d9ecd5418d8..bc9556991d7c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); - call->server = afs_use_server(server, afs_server_trace_give_up_cb); + call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb); afs_make_call(call, GFP_NOFS); afs_wait_for_call_to_complete(call); ret = call->error; @@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, return false; call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); + call->server = afs_use_server(server, false, afs_server_trace_use_get_caps); call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); call->probe_index = addr_index; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df30bd62da79..440b0e731093 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -287,9 +287,8 @@ struct afs_net { /* Cell database */ struct rb_root cells; + struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */ struct afs_cell __rcu *ws_cell; - struct work_struct cells_manager; - struct timer_list cells_timer; atomic_t 
cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; @@ -301,18 +300,11 @@ struct afs_net { * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ - seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ - struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + seqlock_t fs_lock; /* For fs_probe_*, fs_proc */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ - seqlock_t fs_addr_lock; /* For fs_addresses[46] */ - - struct work_struct fs_manager; - struct timer_list fs_timer; - struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -345,13 +337,10 @@ struct afs_net { extern const char afs_init_sysname[]; enum afs_cell_state { - AFS_CELL_UNSET, - AFS_CELL_ACTIVATING, + AFS_CELL_SETTING_UP, AFS_CELL_ACTIVE, - AFS_CELL_DEACTIVATING, - AFS_CELL_INACTIVE, - AFS_CELL_FAILED, - AFS_CELL_REMOVED, + AFS_CELL_REMOVING, + AFS_CELL_DEAD, }; /* @@ -382,7 +371,9 @@ struct afs_cell { struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct destroyer; /* Destroyer for cell */ struct work_struct manager; /* Manager for init/deinit/dns */ + struct timer_list management_timer; /* General management timer */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ @@ -398,6 +389,7 @@ struct afs_cell { enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; + unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */ /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ @@ -407,7 +399,7 @@ struct afs_cell { /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ - seqlock_t fs_lock; /* For fs_servers */ + struct rw_semaphore fs_lock; /* For fs_servers */ /* VL server list. 
*/ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -542,22 +534,22 @@ struct afs_server { }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct rb_node uuid_rb; /* Link in net->fs_servers */ - struct afs_server __rcu *uuid_next; /* Next server with same UUID */ - struct afs_server *uuid_prev; /* Previous server with same UUID */ - struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr_link; /* Link in net->fs_addresses6 */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_* */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ - struct afs_server *gc_next; /* Next server in manager's list */ + struct work_struct destroyer; /* Work item to try and destroy a server */ + struct timer_list timer; /* Management timer */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ -#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ -#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ -#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ +#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */ +#define AFS_SERVER_FL_CREATING 4 /* The record is being created */ +#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */ +#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ @@ -567,6 +559,7 @@ struct afs_server { atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. 
*/ + short create_error; /* Creation error */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ @@ -621,6 +614,7 @@ struct afs_volume { afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; + unsigned int debug_id; /* Debugging ID for traces */ time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ @@ -700,7 +694,6 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ @@ -1008,6 +1001,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist); /* * addr_prefs.c @@ -1044,16 +1040,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); -extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, - const char *, bool); +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, bool excl, + enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_manage_cells(struct work_struct *); -extern void afs_cells_timer(struct timer_list *); +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs); extern void __net_exit afs_cell_purge(struct afs_net *); /* @@ -1111,11 +1108,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *); extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; -extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); -extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); -extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); -extern int afs_dynroot_populate(struct super_block *); -extern void afs_dynroot_depopulate(struct super_block *); +struct inode *afs_dynroot_iget_root(struct super_block *sb); /* * file.c @@ -1207,8 +1200,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, 
enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_addrs, struct key *key); +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); @@ -1228,7 +1221,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); -extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, @@ -1510,20 +1502,30 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); -extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +struct afs_server *afs_find_server(const struct rxrpc_peer *peer); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); -extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason); +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_manage_servers(struct work_struct *); -extern void afs_servers_timer(struct timer_list *); +void afs_purge_servers(struct afs_cell *cell); extern void afs_fs_probe_timer(struct timer_list *); -extern void __net_exit afs_purge_servers(struct afs_net *); +void __net_exit afs_wait_for_servers(struct afs_net *net); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); +static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) +{ + int r = refcount_read(&server->ref); + int a = atomic_read(&server->active); + + trace_afs_server(server->debug_id, r, a, trace); + +} + static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); diff --git a/fs/afs/main.c b/fs/afs/main.c index 1ae0067f772d..c845c5daaeba 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -76,25 +76,17 @@ static int __net_init afs_net_init(struct net *net_ns) mutex_init(&net->socket_mutex); net->cells = RB_ROOT; + idr_init(&net->cells_dyn_ino); init_rwsem(&net->cells_lock); - 
INIT_WORK(&net->cells_manager, afs_manage_cells); - timer_setup(&net->cells_timer, afs_cells_timer, 0); - mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); - net->fs_servers = RB_ROOT; INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses); - seqlock_init(&net->fs_addr_lock); - - INIT_WORK(&net->fs_manager, afs_manage_servers); - timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); @@ -130,13 +122,14 @@ error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: + idr_destroy(&net->cells_dyn_ino); net->live = false; return ret; } @@ -151,10 +144,11 @@ static void __net_exit afs_net_exit(struct net *net_ns) net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + idr_destroy(&net->cells_dyn_ino); kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 507c25a5b2cb..45cee6534122 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); return PTR_ERR(cell); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 12c88d8be3fe..40e879c8ca77 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true); + cell = afs_lookup_cell(net, name, strlen(name), args, true, + afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); + afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } @@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) } server = list_entry(v, struct afs_server, proc_link); - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), @@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) server->flags, server->rtt); seq_printf(m, " - probe: last=%d\n", (int)(jiffies - server->probed_at) / HZ); + + estate = rcu_dereference(server->endpoint_state); + if (!estate) + goto out; failed = 
estate->failed_set; seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", estate->probe_seq, atomic_read(&estate->nr_probing), estate->responsive_set, estate->failed_set); + + alist = estate->addresses; seq_printf(m, " - ALIST v=%u ap=%u\n", alist->version, alist->addr_pref_version); for (i = 0; i < alist->nr_addrs; i++) { @@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) rxrpc_kernel_get_srtt(addr->peer), addr->last_error, addr->prio); } + +out: return 0; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 886416ea1d96..d5e480a33859 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -179,7 +179,7 @@ static void afs_free_call(struct afs_call *call) if (call->type->destructor) call->type->destructor(call); - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); + afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); kfree(call->request); o = atomic_read(&net->nr_outstanding_calls); @@ -766,8 +766,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { + struct afs_call *call = (struct afs_call *)user_call_ID; struct afs_net *net = afs_sock2net(sk); + call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); + call->server = afs_find_server(call->peer); + if (!call->server) + trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); + queue_work(afs_wq, &net->charge_preallocation_work); } diff --git a/fs/afs/server.c b/fs/afs/server.c index 4504e16b458c..c530d1ca15df 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -14,190 +14,104 @@ static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static atomic_t afs_server_debug_id; -static struct afs_server *afs_maybe_use_server(struct afs_server *, - enum afs_server_trace); static void __afs_put_server(struct afs_net *, struct afs_server *); +static void afs_server_timer(struct timer_list *timer); +static void afs_server_destroyer(struct work_struct *work); /* * Find a server by one of its addresses. */ -struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) +struct afs_server *afs_find_server(const struct rxrpc_peer *peer) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server = NULL; - unsigned int i; - int seq = 1; + struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer); - rcu_read_lock(); - - do { - if (server) - afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - - hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; - for (i = 0; i < alist->nr_addrs; i++) - if (alist->addrs[i].peer == peer) - goto found; - } - - server = NULL; - continue; - found: - server = afs_maybe_use_server(server, afs_server_trace_get_by_addr); - - } while (need_seqretry(&net->fs_addr_lock, seq)); - - done_seqretry(&net->fs_addr_lock, seq); - - rcu_read_unlock(); - return server; + if (!server) + return NULL; + return afs_use_server(server, false, afs_server_trace_use_cm_call); } /* - * Look up a server by its UUID and mark it active. + * Look up a server by its UUID and mark it active. The caller must hold + * cell->fs_lock. 
*/ -struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) +static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_server *server = NULL; + struct afs_server *server; struct rb_node *p; - int diff, seq = 1; + int diff; _enter("%pU", uuid); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. - */ - if (server) - afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_lock, &seq); - - p = net->fs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, uuid_rb); - - diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); - if (diff < 0) { - p = p->rb_left; - } else if (diff > 0) { - p = p->rb_right; - } else { - afs_use_server(server, afs_server_trace_get_by_uuid); - break; - } - - server = NULL; - } - } while (need_seqretry(&net->fs_lock, seq)); + p = cell->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); - done_seqretry(&net->fs_lock, seq); + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) + return NULL; /* Need a write lock */ + afs_use_server(server, true, afs_server_trace_use_by_uuid); + return server; + } + } - _leave(" = %p", server); - return server; + return NULL; } /* - * Install a server record in the namespace tree. If there's a clash, we stick - * it into a list anchored on whichever afs_server struct is actually in the - * tree. + * Install a server record in the cell tree. The caller must hold an exclusive + * lock on cell->fs_lock. */ static struct afs_server *afs_install_server(struct afs_cell *cell, - struct afs_server *candidate) + struct afs_server **candidate) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server, *next; + struct afs_server *server; struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; _enter("%p", candidate); - write_seqlock(&net->fs_lock); - /* Firstly install the server in the UUID lookup tree */ - pp = &net->fs_servers.rb_node; + pp = &cell->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); server = rb_entry(p, struct afs_server, uuid_rb); - diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); - if (diff < 0) { + diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - } else if (diff > 0) { + else if (diff > 0) pp = &(*pp)->rb_right; - } else { - if (server->cell == cell) - goto exists; - - /* We have the same UUID representing servers in - * different cells. Append the new server to the list. 
- */ - for (;;) { - next = rcu_dereference_protected( - server->uuid_next, - lockdep_is_held(&net->fs_lock.lock)); - if (!next) - break; - server = next; - } - rcu_assign_pointer(server->uuid_next, candidate); - candidate->uuid_prev = server; - server = candidate; - goto added_dup; - } + else + goto exists; } - server = candidate; + server = *candidate; + *candidate = NULL; rb_link_node(&server->uuid_rb, p, pp); - rb_insert_color(&server->uuid_rb, &net->fs_servers); + rb_insert_color(&server->uuid_rb, &cell->fs_servers); + write_seqlock(&net->fs_lock); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + write_sequnlock(&net->fs_lock); afs_get_cell(cell, afs_cell_trace_get_server); -added_dup: - write_seqlock(&net->fs_addr_lock); - estate = rcu_dereference_protected(server->endpoint_state, - lockdep_is_held(&net->fs_addr_lock.lock)); - alist = estate->addresses; - - /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install - * it in the IPv4 and/or IPv6 reverse-map lists. - * - * TODO: For speed we want to use something other than a flat list - * here; even sorting the list in terms of lowest address would help a - * bit, but anything we might want to do gets messy and memory - * intensive. - */ - if (alist->nr_addrs > 0) - hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); - - write_sequnlock(&net->fs_addr_lock); - exists: - afs_get_server(server, afs_server_trace_get_install); - write_sequnlock(&net->fs_lock); + afs_use_server(server, true, afs_server_trace_use_install); return server; } /* - * Allocate a new server record and mark it active. + * Allocate a new server record and mark it as active but uncreated. */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const uuid_t *uuid, - struct afs_addr_list *alist) +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_endpoint_state *estate; struct afs_server *server; struct afs_net *net = cell->net; @@ -205,65 +119,49 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); if (!server) - goto enomem; - - estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL); - if (!estate) - goto enomem_server; + return NULL; refcount_set(&server->ref, 1); - atomic_set(&server->active, 1); + atomic_set(&server->active, 0); + __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); server->debug_id = atomic_inc_return(&afs_server_debug_id); - server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); + INIT_WORK(&server->destroyer, &afs_server_destroyer); + timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); + INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; server->service_id = FS_SERVICE; - server->probe_counter = 1; server->probed_at = jiffies - LONG_MAX / 2; - refcount_set(&estate->ref, 1); - estate->addresses = alist; - estate->server_id = server->debug_id; - estate->probe_seq = 1; - rcu_assign_pointer(server->endpoint_state, estate); afs_inc_servers_outstanding(net); - trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); - trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), - afs_estate_trace_alloc_server); _leave(" = %p", server); return server; - -enomem_server: - kfree(server); -enomem: - _leave(" = NULL [nomem]"); - return NULL; 
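
The hunks above allocate fileserver records in an "uncreated" state (AFS_SERVER_FL_UNCREATED, active count 0), and afs_lookup_server() just below completes creation lazily: the first caller flips the record to AFS_SERVER_FL_CREATING, performs the VL address lookup and dispatches the initial probes, then clears the bit and wakes any waiters; on failure the error is parked in server->create_error and the record is marked uncreated again so waiters can pick the error up. What follows is a rough, self-contained userspace C analogue of that handshake, not part of the patch; the record / record_get() / do_create() names are invented for illustration, and a mutex plus condition variable stand in for the kernel's bit-wait helpers.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct record {
        pthread_mutex_t lock;
        pthread_cond_t  done;
        bool            uncreated;      /* cf. AFS_SERVER_FL_UNCREATED */
        bool            creating;       /* cf. AFS_SERVER_FL_CREATING */
        int             create_error;   /* cf. server->create_error */
};

/* Stand-in for the expensive setup (VL address lookup + first probe). */
static int do_create(struct record *r)
{
        (void)r;
        return 0;                       /* assume success for the sketch */
}

static int record_get(struct record *r)
{
        int err = 0;

        pthread_mutex_lock(&r->lock);
        while (r->creating)             /* cf. wait_on_bit(..., CREATING) */
                pthread_cond_wait(&r->done, &r->lock);

        if (r->uncreated && r->create_error) {
                /* Creation failed while we waited; report the saved error. */
                err = r->create_error;
                pthread_mutex_unlock(&r->lock);
                return err;
        }
        if (!r->uncreated) {            /* already fully created */
                pthread_mutex_unlock(&r->lock);
                return 0;
        }

        /* We get to do the creation; later callers wait on "creating". */
        r->uncreated = false;
        r->creating = true;
        pthread_mutex_unlock(&r->lock);

        err = do_create(r);

        pthread_mutex_lock(&r->lock);
        if (err) {
                r->create_error = err;  /* parked for the waiters */
                r->uncreated = true;
        }
        r->creating = false;            /* cf. clear_and_wake_up_bit() */
        pthread_cond_broadcast(&r->done);
        pthread_mutex_unlock(&r->lock);
        return err;
}

int main(void)
{
        struct record r = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .done = PTHREAD_COND_INITIALIZER,
                .uncreated = true,
        };

        printf("first lookup:  %d\n", record_get(&r));  /* performs creation */
        printf("second lookup: %d\n", record_get(&r));  /* finds it ready */
        return 0;
}

Built with "cc -pthread", the first call performs the one-time setup and the second returns immediately; the kernel version achieves the same exclusion with the two flag bits under cell->fs_lock instead of a mutex.
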
} /* * Look up an address record for a server */ -static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, - struct key *key, const uuid_t *uuid) +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server, + struct key *key) { struct afs_vl_cursor vc; struct afs_addr_list *alist = NULL; int ret; ret = -ERESTARTSYS; - if (afs_begin_vlserver_operation(&vc, cell, key)) { + if (afs_begin_vlserver_operation(&vc, server->cell, key)) { while (afs_select_vlserver(&vc)) { if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) - alist = afs_yfsvl_get_endpoints(&vc, uuid); + alist = afs_yfsvl_get_endpoints(&vc, &server->uuid); else - alist = afs_vl_get_addrs_u(&vc, uuid); + alist = afs_vl_get_addrs_u(&vc, &server->uuid); } ret = afs_end_vlserver_operation(&vc); @@ -273,72 +171,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, } /* - * Get or create a fileserver record. + * Get or create a fileserver record and return it with an active-use count on + * it. */ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, const uuid_t *uuid, u32 addr_version) { - struct afs_addr_list *alist; - struct afs_server *server, *candidate; + struct afs_addr_list *alist = NULL; + struct afs_server *server, *candidate = NULL; + bool creating = false; + int ret; _enter("%p,%pU", cell->net, uuid); - server = afs_find_server_by_uuid(cell->net, uuid); + down_read(&cell->fs_lock); + server = afs_find_server_by_uuid(cell, uuid); + /* Won't see servers marked uncreated. */ + up_read(&cell->fs_lock); + if (server) { + timer_delete_sync(&server->timer); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) + goto wait_for_creation; if (server->addr_version != addr_version) set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); return server; } - alist = afs_vl_lookup_addrs(cell, key, uuid); - if (IS_ERR(alist)) - return ERR_CAST(alist); - - candidate = afs_alloc_server(cell, uuid, alist); + candidate = afs_alloc_server(cell, uuid); if (!candidate) { afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } - server = afs_install_server(cell, candidate); - if (server != candidate) { - afs_put_addrlist(alist, afs_alist_trace_put_server_dup); + down_write(&cell->fs_lock); + server = afs_install_server(cell, &candidate); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) { + /* We need to wait for creation to complete. */ + up_write(&cell->fs_lock); + goto wait_for_creation; + } + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + set_bit(AFS_SERVER_FL_CREATING, &server->flags); + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; + } + up_write(&cell->fs_lock); + timer_delete_sync(&server->timer); + + /* If we get to create the server, we look up the addresses and then + * immediately dispatch an asynchronous probe to each interface on the + * fileserver. This will make sure the repeat-probing service is + * started. + */ + if (creating) { + alist = afs_vl_lookup_addrs(server, key); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto create_failed; + } + + ret = afs_fs_probe_fileserver(cell->net, server, alist, key); + if (ret) + goto create_failed; + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + } + +out: + afs_put_addrlist(alist, afs_alist_trace_put_server_create); + if (candidate) { + kfree(rcu_access_pointer(server->endpoint_state)); kfree(candidate); - } else { - /* Immediately dispatch an asynchronous probe to each interface - * on the fileserver. 
This will make sure the repeat-probing - * service is started. - */ - afs_fs_probe_fileserver(cell->net, server, alist, key); + afs_dec_servers_outstanding(cell->net); + } + return server ?: ERR_PTR(ret); + +wait_for_creation: + afs_see_server(server, afs_server_trace_wait_create); + wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE); + if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) { + /* Barrier: read flag before error */ + ret = READ_ONCE(server->create_error); + afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + goto out; } - return server; -} + ret = 0; + goto out; -/* - * Set the server timer to fire after a given delay, assuming it's not already - * set for an earlier time. - */ -static void afs_set_server_timer(struct afs_net *net, time64_t delay) -{ - if (net->live) { - afs_inc_servers_outstanding(net); - if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) - afs_dec_servers_outstanding(net); +create_failed: + down_write(&cell->fs_lock); + + WRITE_ONCE(server->create_error, ret); + smp_wmb(); /* Barrier: set error before flag. */ + set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; } + afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + + up_write(&cell->fs_lock); + goto out; } /* - * Server management timer. We have an increment on fs_outstanding that we - * need to pass along to the work item. + * Set/reduce a server's timer. */ -void afs_servers_timer(struct timer_list *timer) +static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs) { - struct afs_net *net = container_of(timer, struct afs_net, fs_timer); - - _enter(""); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + mod_timer(&server->timer, jiffies + delay_secs * HZ); } /* @@ -357,32 +305,20 @@ struct afs_server *afs_get_server(struct afs_server *server, } /* - * Try to get a reference on a server object. + * Get an active count on a server object and maybe remove from the inactive + * list. */ -static struct afs_server *afs_maybe_use_server(struct afs_server *server, - enum afs_server_trace reason) -{ - unsigned int a; - int r; - - if (!__refcount_inc_not_zero(&server->ref, &r)) - return NULL; - - a = atomic_inc_return(&server->active); - trace_afs_server(server->debug_id, r + 1, a, reason); - return server; -} - -/* - * Get an active count on a server object. 
- */ -struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason) +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason) { unsigned int a; int r; __refcount_inc(&server->ref, &r); a = atomic_inc_return(&server->active); + if (a == 1 && activate && + !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + del_timer(&server->timer); trace_afs_server(server->debug_id, r + 1, a, reason); return server; @@ -415,13 +351,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - unsigned int active = atomic_dec_return(&server->active); + if (!server) + return; - if (active == 0) - afs_set_server_timer(net, afs_server_gc_delay); - afs_put_server(net, server, reason); + if (atomic_dec_and_test(&server->active)) { + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) || + READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING) + schedule_work(&server->destroyer); } + + afs_put_server(net, server, reason); } /* @@ -430,10 +369,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, void afs_unuse_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - server->unuse_time = ktime_get_real_seconds(); - afs_unuse_server_notime(net, server, reason); + if (!server) + return; + + if (atomic_dec_and_test(&server->active)) { + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) && + READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) { + time64_t unuse_time = ktime_get_real_seconds(); + + server->unuse_time = unuse_time; + afs_set_server_timer(server, afs_server_gc_delay); + } else { + schedule_work(&server->destroyer); + } } + + afs_put_server(net, server, reason); } static void afs_server_rcu(struct rcu_head *rcu) @@ -463,159 +414,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server } /* - * destroy a dead server + * Check to see if the server record has expired. */ -static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +static bool afs_has_server_expired(const struct afs_server *server) { - if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) - afs_give_up_callbacks(net, server); + time64_t expires_at; - afs_put_server(net, server, afs_server_trace_destroy); + if (atomic_read(&server->active)) + return false; + + if (server->cell->net->live || + server->cell->state >= AFS_CELL_REMOVING) { + trace_afs_server(server->debug_id, refcount_read(&server->ref), + 0, afs_server_trace_purging); + return true; + } + + expires_at = server->unuse_time; + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expires_at += afs_server_gc_delay; + + return ktime_get_real_seconds() > expires_at; } /* - * Garbage collect any expired servers. + * Remove a server record from it's parent cell's database. 
*/ -static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +static bool afs_remove_server_from_cell(struct afs_server *server) { - struct afs_server *server, *next, *prev; - int active; - - while ((server = gc_list)) { - gc_list = server->gc_next; - - write_seqlock(&net->fs_lock); - - active = atomic_read(&server->active); - if (active == 0) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_gc); - next = rcu_dereference_protected( - server->uuid_next, lockdep_is_held(&net->fs_lock.lock)); - prev = server->uuid_prev; - if (!prev) { - /* The one at the front is in the tree */ - if (!next) { - rb_erase(&server->uuid_rb, &net->fs_servers); - } else { - rb_replace_node_rcu(&server->uuid_rb, - &next->uuid_rb, - &net->fs_servers); - next->uuid_prev = NULL; - } - } else { - /* This server is not at the front */ - rcu_assign_pointer(prev->uuid_next, next); - if (next) - next->uuid_prev = prev; - } - - list_del(&server->probe_link); - hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr_link)) - hlist_del_rcu(&server->addr_link); - } - write_sequnlock(&net->fs_lock); + struct afs_cell *cell = server->cell; + + down_write(&cell->fs_lock); - if (active == 0) - afs_destroy_server(net, server); + if (!afs_has_server_expired(server)) { + up_write(&cell->fs_lock); + return false; } + + set_bit(AFS_SERVER_FL_EXPIRED, &server->flags); + _debug("expire %pU %u", &server->uuid, atomic_read(&server->active)); + afs_see_server(server, afs_server_trace_see_expired); + rb_erase(&server->uuid_rb, &cell->fs_servers); + up_write(&cell->fs_lock); + return true; } -/* - * Manage the records of servers known to be within a network namespace. This - * includes garbage collecting unused servers. - * - * Note also that we were given an increment on net->servers_outstanding by - * whoever queued us that we need to deal with before returning. - */ -void afs_manage_servers(struct work_struct *work) +static void afs_server_destroyer(struct work_struct *work) { - struct afs_net *net = container_of(work, struct afs_net, fs_manager); - struct afs_server *gc_list = NULL; - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; - - _enter(""); + struct afs_endpoint_state *estate; + struct afs_server *server = container_of(work, struct afs_server, destroyer); + struct afs_net *net = server->cell->net; - /* Trawl the server list looking for servers that have expired from - * lack of use. 
- */ - read_seqlock_excl(&net->fs_lock); + afs_see_server(server, afs_server_trace_see_destroyer); - for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { - struct afs_server *server = - rb_entry(cursor, struct afs_server, uuid_rb); - int active = atomic_read(&server->active); + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + return; - _debug("manage %pU %u", &server->uuid, active); + if (!afs_remove_server_from_cell(server)) + return; - if (purging) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_purging); - if (active != 0) - pr_notice("Can't purge s=%08x\n", server->debug_id); - } + timer_shutdown_sync(&server->timer); + cancel_work(&server->destroyer); - if (active == 0) { - time64_t expire_at = server->unuse_time; - - if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && - !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) - expire_at += afs_server_gc_delay; - if (purging || expire_at <= now) { - server->gc_next = gc_list; - gc_list = server; - } else if (expire_at < next_manage) { - next_manage = expire_at; - } - } - } + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_give_up_callbacks(net, server); - read_sequnlock_excl(&net->fs_lock); + /* Unbind the rxrpc_peer records from the server. */ + estate = rcu_access_pointer(server->endpoint_state); + if (estate) + afs_set_peer_appdata(server, estate->addresses, NULL); - /* Update the timer on the way out. We have to pass an increment on - * servers_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); + write_seqlock(&net->fs_lock); + list_del_init(&server->probe_link); + if (!hlist_unhashed(&server->proc_link)) + hlist_del_rcu(&server->proc_link); + write_sequnlock(&net->fs_lock); - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->fs_manager)) - afs_inc_servers_outstanding(net); - } else { - afs_set_server_timer(net, next_manage - now); - } - } + afs_put_server(net, server, afs_server_trace_destroy); +} - afs_gc_servers(net, gc_list); +static void afs_server_timer(struct timer_list *timer) +{ + struct afs_server *server = container_of(timer, struct afs_server, timer); - afs_dec_servers_outstanding(net); - _leave(" [%d]", atomic_read(&net->servers_outstanding)); + afs_see_server(server, afs_server_trace_see_timer); + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + schedule_work(&server->destroyer); } -static void afs_queue_server_manager(struct afs_net *net) +/* + * Wake up all the servers in a cell so that they can purge themselves. + */ +void afs_purge_servers(struct afs_cell *cell) { - afs_inc_servers_outstanding(net); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + struct afs_server *server; + struct rb_node *rb; + + down_read(&cell->fs_lock); + for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) { + server = rb_entry(rb, struct afs_server, uuid_rb); + afs_see_server(server, afs_server_trace_see_purge); + schedule_work(&server->destroyer); + } + up_read(&cell->fs_lock); } /* - * Purge list of servers. + * Wait for outstanding servers. 
*/ -void afs_purge_servers(struct afs_net *net) +void afs_wait_for_servers(struct afs_net *net) { _enter(""); - if (del_timer_sync(&net->fs_timer)) - afs_dec_servers_outstanding(net); - - afs_queue_server_manager(net); - - _debug("wait"); atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); @@ -639,7 +550,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op, atomic_read(&server->active), afs_server_trace_update); - alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); + alist = afs_vl_lookup_addrs(server, op->key); if (IS_ERR(alist)) { rcu_read_lock(); estate = rcu_dereference(server->endpoint_state); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index d20cd902ef94..20d5474837df 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, - afs_server_trace_put_slist); + afs_server_trace_unuse_slist); kfree_rcu(slist, rcu); } } @@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_unuse_server(volume->cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server_notime(volume->cell->net, server, + afs_server_trace_unuse_slist_isort); continue; } diff --git a/fs/afs/super.c b/fs/afs/super.c index a9bee610674e..25b306db6992 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) if (as->dyn_root) seq_puts(m, ",dyn"); - if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags)) - seq_puts(m, ",autocell"); switch (as->flock_mode) { case afs_flock_mode_unset: break; case afs_flock_mode_local: p = "local"; break; @@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false); + NULL, false, + afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse); afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = NULL; cell = afs_use_cell(ctx->cell->alias_of, afs_cell_trace_use_fc_alias); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { - inode = afs_iget_pseudo_dir(sb, true); + inode = afs_dynroot_iget_root(sb); } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); @@ -478,9 +477,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - if (ctx->autocell || as->dyn_root) - set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) @@ -488,9 +484,6 
@@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (as->dyn_root) { sb->s_d_op = &afs_dynroot_dentry_operations; - ret = afs_dynroot_populate(sb); - if (ret < 0) - goto error; } else { sb->s_d_op = &afs_fs_dentry_operations; rcu_assign_pointer(as->volume->sb, sb); @@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { - struct afs_net *net = afs_net(as->net_ns); afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); - afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); + afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); - if (as->dyn_root) - afs_dynroot_depopulate(sb); - /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f9e76b604f31..709b4cdb723e 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index d8f79f6ada3d..6ad9688d8f4b 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - afs_queue_cell(cell, afs_cell_trace_get_queue_dns); + afs_queue_cell(cell, afs_cell_trace_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index af3a3f57c1b3..0efff3d25133 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,6 +10,7 @@ #include "internal.h" static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static atomic_t afs_volume_debug_id; static void afs_destroy_volume(struct work_struct *work); @@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) struct afs_cell *cell = volume->cell; if (!hlist_unhashed(&volume->proc_link)) { - trace_afs_volume(volume->vid, refcount_read(&cell->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), 
afs_volume_trace_remove); write_seqlock(&cell->volume_lock); hlist_del_rcu(&volume->proc_link); @@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, if (!volume) goto error_0; + volume->debug_id = atomic_inc_return(&afs_volume_debug_id); volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); @@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, *_slist = slist; rcu_assign_pointer(volume->servers, slist); - trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); + trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc); return volume; error_1: @@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work) afs_remove_volume_from_cell(volume); afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); - trace_afs_volume(volume->vid, refcount_read(&volume->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); kfree_rcu(volume, rcu); @@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) int r; if (__refcount_inc_not_zero(&volume->ref, &r)) { - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); return true; } return false; @@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, int r; __refcount_inc(&volume->ref, &r); - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); } return volume; } @@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { + unsigned int debug_id = volume->debug_id; afs_volid_t vid = volume->vid; bool zero; int r; zero = __refcount_dec_and_test(&volume->ref, &r); - trace_afs_volume(vid, r - 1, reason); + trace_afs_volume(debug_id, vid, r - 1, reason); if (zero) schedule_work(&volume->destructor); } diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 77c7991d89aa..23cea74f9933 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *); static inline int autofs_check_pipe(struct file *pipe) { + if (pipe->f_mode & FMODE_PATH) + return -EINVAL; if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 6d57efbb8110..c5a6aae12d2c 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp, sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; - struct inode *inode = base->d_inode; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; @@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp, "the parent autofs mount timeout which could " "prevent shutdown\n"); - inode_lock_shared(inode); dentry = try_lookup_one_len(param->path, base, path_len); - inode_unlock_shared(inode); if (IS_ERR_OR_NULL(dentry)) return dentry ? 
PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 530d18827e35..174c7205fee4 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, @@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); d_add(dentry, inode); if (sbi->version < 5) @@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, inc_nlink(dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - return 0; + return NULL; } /* Get/set timeout ioctl() operation */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 316d88da2ce1..0ef9bcb744dd 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap, return -EIO; } -static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return -EIO; + return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 15725b4ce393..595b57fabc9a 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -511,10 +511,6 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -EXDEV; goto err; } - if (!d_is_positive(victim)) { - ret = -ENOENT; - goto err; - } ret = __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 90ade8f648d9..b2669d7ffec5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -858,10 +858,10 @@ err: return bch2_err_class(ret); } -static int bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) +static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); + return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); } static int bch2_rename2(struct mnt_idmap *idmap, @@ -2396,7 +2396,7 @@ static struct file_system_type bcache_fs_type = { .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, }; 
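[Editor's note] The ->mkdir conversions in this series (autofs, bad_inode, bcachefs above, btrfs below) all switch the inode operation from returning an int to returning a struct dentry *: every converted implementation here returns NULL on success and ERR_PTR(-E...) on failure. The sketch below is a minimal, hedged userspace illustration of the ERR_PTR()/IS_ERR()/PTR_ERR() encoding these conversions rely on; the helpers are re-implemented only for the example (the kernel's real definitions live in include/linux/err.h) and demo_mkdir() is a hypothetical stand-in, not any of the drivers touched here.

	#include <stdbool.h>
	#include <stdio.h>

	#define MAX_ERRNO 4095		/* same bound the kernel uses for errno-in-pointer */
	#define DEMO_ENOMEM 12		/* illustrative errno value */

	/* Illustrative userspace re-implementations, not the kernel headers. */
	static inline void *ERR_PTR(long error)      { return (void *)error; }
	static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline bool  IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	/* Hypothetical ->mkdir following the new convention:
	 * NULL on success, ERR_PTR(-E...) on failure. */
	static void *demo_mkdir(bool fail)
	{
		if (fail)
			return ERR_PTR(-DEMO_ENOMEM);
		return NULL;	/* success: keep using the dentry that was passed in */
	}

	int main(void)
	{
		void *d = demo_mkdir(true);

		if (IS_ERR(d))
			printf("mkdir failed: %ld\n", PTR_ERR(d));	/* prints -12 */
		else
			printf("mkdir succeeded\n");
		return 0;
	}

The point of the encoding is that callers get one return value carrying either "use the dentry you already have" (NULL) or a negative errno smuggled in the pointer bits, which is why the converted call sites wrap their old int results in ERR_PTR().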
MODULE_ALIAS_FS("bcachefs"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf..584fa89bc877 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz, } #define NOTE_DATA_SZ SZ_1K -#define GNU_PROPERTY_TYPE_0_NAME "GNU" -#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) @@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, @@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, "CORE", NT_FILE, size, data); + fill_note(note, NN_FILE, NT_FILE, size, data); return 0; } @@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); @@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? 
NN_PRFPREG : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); @@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); @@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c13ee8180b17..9133f3827f90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* deal with each load segment separately */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { - unsigned long maddr, disp, excess, excess1; + unsigned long maddr, disp, excess; int prot = 0, flags; if (phdr->p_type != PT_LOAD) @@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * extant in the file */ excess = phdr->p_memsz - phdr->p_filesz; - excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU + unsigned long excess1 + = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; @@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); @@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ diff --git 
a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 38756f8cef46..a9e56c994e9e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6744,18 +6744,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..194eacbefc95 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2361,9 +2361,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) { struct inode *inode = folio->mapping->host; sector_t iblock, lblock; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + struct buffer_head *bh, *head, *prev = NULL; size_t blocksize; - int nr, i; int fully_mapped = 1; bool page_error = false; loff_t limit = i_size_read(inode); @@ -2372,16 +2371,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) limit = inode->i_sb->s_maxbytes; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; iblock = div_u64(folio_pos(folio), blocksize); lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; - nr = 0; - i = 0; do { if (buffer_uptodate(bh)) @@ -2398,7 +2393,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) page_error = true; } if (!buffer_mapped(bh)) { - folio_zero_range(folio, i * blocksize, + folio_zero_range(folio, bh_offset(bh), blocksize); if (!err) set_buffer_uptodate(bh); @@ -2411,40 +2406,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (buffer_uptodate(bh)) continue; } - arr[nr++] = bh; - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - if (fully_mapped) - folio_set_mappedtodisk(folio); - - if (!nr) { - /* - * All buffers are uptodate or get_block() returned an - * error when trying to map them - we can finish the read. - */ - folio_end_read(folio, !page_error); - return 0; - } - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - bh = arr[i]; lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + mark_buffer_async_read(bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + prev = bh; + } while (iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + folio_set_mappedtodisk(folio); /* - * Stage 3: start the IO. Check for uptodateness - * inside the buffer lock in case another process reading - * the underlying blockdev brought it uptodate (the sct fix). + * All buffers are uptodate or get_block() returned an error + * when trying to map them - we must finish the read because + * end_buffer_async_read() will never be called on any buffer + * in this folio. 
*/ - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (buffer_uptodate(bh)) - end_buffer_async_read(bh, 1); - else - submit_bh(REQ_OP_READ, bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + else + folio_end_read(folio, !page_error); + return 0; } EXPORT_SYMBOL(block_read_full_folio); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7cf59713f0f7..83a60126de0f 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -128,18 +128,19 @@ retry: ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - if (ret < 0) { + subdir = ERR_PTR(cachefiles_inject_write_error()); + if (!IS_ERR(subdir)) + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); + ret = PTR_ERR(subdir); + if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); goto mkdir_error; } trace_cachefiles_mkdir(dir, subdir); - if (unlikely(d_unhashed(subdir))) { - cachefiles_put_directory(subdir); + if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { + dput(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -195,7 +196,8 @@ mark_error: mkdir_error: inode_unlock(d_inode(dir)); - dput(subdir); + if (!IS_ERR(subdir)) + dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index fe3de9ad57bf..d9bc67176128 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_free_id; } - anon_file->file = anon_inode_getfile("[cachefiles]", - &cachefiles_ondemand_fd_fops, object, O_WRONLY); + anon_file->file = anon_inode_getfile_fmode("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, + O_WRONLY, FMODE_PWRITE | FMODE_LSEEK); if (IS_ERR(anon_file->file)) { ret = PTR_ERR(anon_file->file); goto err_put_fd; @@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_put_file; } - anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - load = (void *)req->msg.data; load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f5224a566b69..29be367905a1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; @@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) return false; } + atomic64_inc(&mdsc->dirty_folios); + ci = ceph_inode(inode); /* dirty the head */ @@ -568,7 +571,36 @@ struct ceph_writeback_ctl u64 truncate_size; u32 truncate_seq; bool size_stable; + bool head_snapc; + struct ceph_snap_context *snapc; + struct ceph_snap_context *last_snapc; + + bool done; + bool should_loop; + bool range_whole; + pgoff_t start_index; + pgoff_t index; + pgoff_t end; + xa_mark_t tag; + + pgoff_t strip_unit_end; + unsigned int wsize; + unsigned int nr_folios; + unsigned int max_pages; + unsigned int locked_pages; + + int op_idx; + int num_ops; + u64 offset; + u64 len; + + struct folio_batch fbatch; + unsigned int processed_in_fbatch; + + bool from_pool; + struct page **pages; + 
struct page **data_pages; }; /* @@ -666,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode, } /* - * Write a single page, but leave the page locked. + * Write a folio, but leave it locked. * * If we get a write error, mark the mapping for error, but still adjust the - * dirty page accounting (i.e., page is no longer dirty). + * dirty page accounting (i.e., folio is no longer dirty). */ -static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +static int write_folio_nounlock(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; - loff_t page_off = page_offset(page); + loff_t page_off = folio_pos(folio); int err; - loff_t len = thp_size(page); + loff_t len = folio_size(folio); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; @@ -689,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; - doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, - page->index); + doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio, + folio->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc) { - doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), - page); + doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode), + folio); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { - doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", - ceph_vinop(inode), page, snapc); + doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n", + ceph_vinop(inode), folio, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return 0; } ceph_put_snap_context(oldest); @@ -726,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? 
round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; - doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", - ceph_vinop(inode), page, page->index, page_off, wlen, snapc, + doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n", + ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -740,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return PTR_ERR(req); } if (wlen < len) len = wlen; - set_page_writeback(page); + folio_start_writeback(folio); if (caching) - ceph_set_page_fscache(page); + ceph_set_page_fscache(&folio->page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { - bounce_page = fscrypt_encrypt_pagecache_blocks(page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > thp_size(page)); + WARN_ON_ONCE(len > folio_size(folio)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? &bounce_page : &page, wlen, 0, false, false); @@ -791,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", - ceph_vinop(inode), page); - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + ceph_vinop(inode), folio); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; - doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", - ceph_vinop(inode), err, page); + doutc(cl, "%llx.%llx setting mapping error %d %p\n", + ceph_vinop(inode), err, folio); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", - ceph_vinop(inode), page); + ceph_vinop(inode), folio); err = 0; /* vfs expects us to return 0 */ } - oldest = detach_page_private(page); + oldest = folio_detach_private(folio); WARN_ON_ONCE(oldest != snapc); - end_page_writeback(page); + folio_end_writeback(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -820,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) return err; } -static int ceph_writepage(struct page *page, struct writeback_control *wbc) -{ - int err; - struct inode *inode = page->mapping->host; - BUG_ON(!inode); - ihold(inode); - - if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ - - err = writepage_nounlock(page, wbc); - if (err == -ERESTARTSYS) { - /* direct memory reclaimer was killed by SIGKILL. return 0 - * to prevent caller from setting mapping/page error */ - err = 0; - } - unlock_page(page); - iput(inode); - return err; -} - /* * async writeback completion handler. 
* @@ -865,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); unsigned int len = 0; bool remove_page; @@ -920,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + + if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) { + wake_up_all(&mdsc->flush_end_wq); + WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0); + } + doutc(cl, "unlocking %p\n", page); if (remove_page) @@ -949,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_dec_osd_stopping_blocker(fsc->mdsc); } -/* - * initiate async writeback - */ -static int ceph_writepages_start(struct address_space *mapping, - struct writeback_control *wbc) +static inline +bool is_forced_umount(struct address_space *mapping) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; - struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start_index, end = -1; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct folio_batch fbatch; - int rc = 0; - unsigned int wsize = i_blocksize(inode); - struct ceph_osd_request *req = NULL; - struct ceph_writeback_ctl ceph_wbc; - bool should_loop, range_whole = false; - bool done = false; - bool caching = ceph_is_cache_enabled(inode); - xa_mark_t tag; - - if (wbc->sync_mode == WB_SYNC_NONE && - fsc->write_congested) - return 0; - - doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : - (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { @@ -987,388 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping, ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); - return -EIO; /* we're in a forced umount, don't write! */ + return true; } + + return false; +} + +static inline +unsigned int ceph_define_write_size(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + unsigned int wsize = i_blocksize(inode); + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - folio_batch_init(&fbatch); + return wsize; +} + +static inline +void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_init(&ceph_wbc->fbatch); + ceph_wbc->processed_in_fbatch = 0; +} + +static inline +void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_release(&ceph_wbc->fbatch); + ceph_folio_batch_init(ceph_wbc); +} + +static inline +void ceph_init_writeback_ctl(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + ceph_wbc->snapc = NULL; + ceph_wbc->last_snapc = NULL; + + ceph_wbc->strip_unit_end = 0; + ceph_wbc->wsize = ceph_define_write_size(mapping); - start_index = wbc->range_cyclic ? mapping->writeback_index : 0; - index = start_index; + ceph_wbc->nr_folios = 0; + ceph_wbc->max_pages = 0; + ceph_wbc->locked_pages = 0; + + ceph_wbc->done = false; + ceph_wbc->should_loop = false; + ceph_wbc->range_whole = false; + + ceph_wbc->start_index = wbc->range_cyclic ? 
mapping->writeback_index : 0; + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag = PAGECACHE_TAG_TOWRITE; + ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; } else { - tag = PAGECACHE_TAG_DIRTY; + ceph_wbc->tag = PAGECACHE_TAG_DIRTY; } -retry: + + ceph_wbc->op_idx = -1; + ceph_wbc->num_ops = 0; + ceph_wbc->offset = 0; + ceph_wbc->len = 0; + ceph_wbc->from_pool = false; + + ceph_folio_batch_init(ceph_wbc); + + ceph_wbc->pages = NULL; + ceph_wbc->data_pages = NULL; +} + +static inline +int ceph_define_writeback_range(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + /* find oldest snap context with dirty data */ - snapc = get_oldest_context(inode, &ceph_wbc, NULL); - if (!snapc) { + ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL); + if (!ceph_wbc->snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); - goto out; + return -ENODATA; } - doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, - snapc->seq, snapc->num_snaps); - should_loop = false; - if (ceph_wbc.head_snapc && snapc != last_snapc) { + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", + ceph_wbc->snapc, ceph_wbc->snapc->seq, + ceph_wbc->snapc->num_snaps); + + ceph_wbc->should_loop = false; + + if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) { /* where to start/end? */ if (wbc->range_cyclic) { - index = start_index; - end = -1; - if (index > 0) - should_loop = true; - doutc(cl, " cyclic, start at %lu\n", index); + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; + doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + ceph_wbc->index = wbc->range_start >> PAGE_SHIFT; + ceph_wbc->end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - doutc(cl, " not cyclic, %lu to %lu\n", index, end); + ceph_wbc->range_whole = true; + doutc(cl, " not cyclic, %lu to %lu\n", + ceph_wbc->index, ceph_wbc->end); } - } else if (!ceph_wbc.head_snapc) { + } else if (!ceph_wbc->head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. 
* They are not writeable until we write all dirty pages * associated with 'snapc' get written */ - if (index > 0) - should_loop = true; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(ceph_wbc->last_snapc); + ceph_wbc->last_snapc = ceph_wbc->snapc; - ceph_put_snap_context(last_snapc); - last_snapc = snapc; + return 0; +} - while (!done && index <= end) { - int num_ops = 0, op_idx; - unsigned i, nr_folios, max_pages, locked_pages = 0; - struct page **pages = NULL, **data_pages; - struct page *page; - pgoff_t strip_unit_end = 0; - u64 offset = 0, len = 0; - bool from_pool = false; +static inline +bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; +} - max_pages = wsize >> PAGE_SHIFT; +static inline +bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, + unsigned index) +{ + return index < ceph_wbc->nr_folios && + ceph_wbc->locked_pages < ceph_wbc->max_pages; +} -get_more_pages: - nr_folios = filemap_get_folios_tag(mapping, &index, - end, tag, &fbatch); - doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); - if (!nr_folios && !locked_pages) - break; - for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - struct folio *folio = fbatch.folios[i]; +static +int ceph_check_page_before_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_snap_context *pgsnapc; - page = &folio->page; - doutc(cl, "? 
%p idx %lu\n", page, page->index); - if (locked_pages == 0) - lock_page(page); /* first page */ - else if (!trylock_page(page)) - break; + /* only dirty folios, or our accounting breaks */ + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { + doutc(cl, "!dirty or !mapping %p\n", folio); + return -ENODATA; + } - /* only dirty pages, or our accounting breaks */ - if (unlikely(!PageDirty(page)) || - unlikely(page->mapping != mapping)) { - doutc(cl, "!dirty or !mapping %p\n", page); - unlock_page(page); - continue; - } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc != snapc) { - doutc(cl, "page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - if (!should_loop && - !ceph_wbc.head_snapc && - wbc->sync_mode != WB_SYNC_NONE) - should_loop = true; - unlock_page(page); - continue; + /* only if matching snap context */ + pgsnapc = page_snap_context(&folio->page); + if (pgsnapc != ceph_wbc->snapc) { + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, + ceph_wbc->snapc, ceph_wbc->snapc->seq); + + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + ceph_wbc->should_loop = true; + + return -ENODATA; + } + + if (folio_pos(folio) >= ceph_wbc->i_size) { + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc->i_size); + + if ((ceph_wbc->size_stable || + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, folio_size(folio)); + + return -ENODATA; + } + + if (ceph_wbc->strip_unit_end && + (folio->index > ceph_wbc->strip_unit_end)) { + doutc(cl, "end of strip unit %p\n", folio); + return -E2BIG; + } + + return 0; +} + +static inline +void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, + unsigned int max_pages) +{ + ceph_wbc->pages = kmalloc_array(max_pages, + sizeof(*ceph_wbc->pages), + GFP_NOFS); + if (!ceph_wbc->pages) { + ceph_wbc->from_pool = true; + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!ceph_wbc->pages); + } +} + +static inline +void ceph_allocate_page_array(struct address_space *mapping, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_calc_file_object_mapping(&ci->i_layout, + ceph_wbc->offset, ceph_wbc->wsize, + &objnum, &objoff, &xlen); + + ceph_wbc->num_ops = 1; + ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); + + BUG_ON(ceph_wbc->pages); + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); + + ceph_wbc->len = 0; +} + +static inline +bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, + const struct folio *folio) +{ + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; +} + +static inline +bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->num_ops >= + (ceph_wbc->from_pool ? 
CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); +} + +static inline +bool is_write_congestion_happened(struct ceph_fs_client *fsc) +{ + return atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); +} + +static inline int move_dirty_folio_in_page_array(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct page **pages = ceph_wbc->pages; + unsigned int index = ceph_wbc->locked_pages; + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; + + if (IS_ENCRYPTED(inode)) { + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, + PAGE_SIZE, + 0, + gfp_flags); + if (IS_ERR(pages[index])) { + if (PTR_ERR(pages[index]) == -EINVAL) { + pr_err_client(cl, "inode->i_blkbits=%hhu\n", + inode->i_blkbits); } - if (page_offset(page) >= ceph_wbc.i_size) { - doutc(cl, "folio at %lu beyond eof %llu\n", - folio->index, ceph_wbc.i_size); - if ((ceph_wbc.size_stable || - folio_pos(folio) >= i_size_read(inode)) && - folio_clear_dirty_for_io(folio)) - folio_invalidate(folio, 0, - folio_size(folio)); + + /* better not fail on first page! */ + BUG_ON(ceph_wbc->locked_pages == 0); + + pages[index] = NULL; + return PTR_ERR(pages[index]); + } + } else { + pages[index] = &folio->page; + } + + ceph_wbc->locked_pages++; + + return 0; +} + +static +int ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct folio *folio = NULL; + unsigned i; + int rc = 0; + + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { + folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + doutc(cl, "? %p idx %lu, folio_test_writeback %#x, " + "folio_test_dirty %#x, folio_test_locked %#x\n", + folio, folio->index, folio_test_writeback(folio), + folio_test_dirty(folio), + folio_test_locked(folio)); + + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + continue; + } + + if (ceph_wbc->locked_pages == 0) + folio_lock(folio); + else if (!folio_trylock(folio)) + break; + + rc = ceph_check_page_before_write(mapping, wbc, + ceph_wbc, folio); + if (rc == -ENODATA) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } else if (rc == -E2BIG) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + break; + } + + if (!folio_clear_dirty_for_io(folio)) { + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } + + /* + * We have something to write. 
If this is + * the first locked page this time through, + * calculate max possible write size and + * allocate a page array + */ + if (ceph_wbc->locked_pages == 0) { + ceph_allocate_page_array(mapping, ceph_wbc, folio); + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { + if (is_num_ops_too_big(ceph_wbc)) { + folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); - continue; - } - if (strip_unit_end && (page->index > strip_unit_end)) { - doutc(cl, "end of strip unit %p\n", page); - unlock_page(page); break; } - if (folio_test_writeback(folio) || - folio_test_private_2(folio) /* [DEPRECATED] */) { - if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", folio); - folio_unlock(folio); - continue; - } - doutc(cl, "waiting on writeback %p\n", folio); - folio_wait_writeback(folio); - folio_wait_private_2(folio); /* [DEPRECATED] */ - } - if (!clear_page_dirty_for_io(page)) { - doutc(cl, "%p !clear_page_dirty_for_io\n", page); - unlock_page(page); - continue; - } + ceph_wbc->num_ops++; + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_wbc->len = 0; + } - /* - * We have something to write. If this is - * the first locked page this time through, - * calculate max possinle write size and - * allocate a page array - */ - if (locked_pages == 0) { - u64 objnum; - u64 objoff; - u32 xlen; - - /* prepare async write request */ - offset = (u64)page_offset(page); - ceph_calc_file_object_mapping(&ci->i_layout, - offset, wsize, - &objnum, &objoff, - &xlen); - len = xlen; - - num_ops = 1; - strip_unit_end = page->index + - ((len - 1) >> PAGE_SHIFT); - - BUG_ON(pages); - max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc_array(max_pages, - sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); - } - - len = 0; - } else if (page->index != - (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - - num_ops++; - offset = (u64)page_offset(page); - len = 0; - } + /* note position of first page in fbatch */ + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", + ceph_vinop(inode), folio, folio->index); - /* note position of first page in fbatch */ - doutc(cl, "%llx.%llx will write page %p idx %lu\n", - ceph_vinop(inode), page, page->index); - - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) - fsc->write_congested = true; - - if (IS_ENCRYPTED(inode)) { - pages[locked_pages] = - fscrypt_encrypt_pagecache_blocks(page, - PAGE_SIZE, 0, - locked_pages ? GFP_NOWAIT : GFP_NOFS); - if (IS_ERR(pages[locked_pages])) { - if (PTR_ERR(pages[locked_pages]) == -EINVAL) - pr_err_client(cl, - "inode->i_blkbits=%hhu\n", - inode->i_blkbits); - /* better not fail on first page! */ - BUG_ON(locked_pages == 0); - pages[locked_pages] = NULL; - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - ++locked_pages; - } else { - pages[locked_pages++] = page; - } + fsc->write_congested = is_write_congestion_happened(fsc); - fbatch.folios[i] = NULL; - len += thp_size(page); + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, + folio); + if (rc) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + break; } - /* did we get anything? 
*/ - if (!locked_pages) - goto release_folios; - if (i) { - unsigned j, n = 0; - /* shift unused page to beginning of fbatch */ - for (j = 0; j < nr_folios; j++) { - if (!fbatch.folios[j]) - continue; - if (n < j) - fbatch.folios[n] = fbatch.folios[j]; - n++; - } - fbatch.nr = n; + ceph_wbc->fbatch.folios[i] = NULL; + ceph_wbc->len += folio_size(folio); + } - if (nr_folios && i == nr_folios && - locked_pages < max_pages) { - doutc(cl, "reached end fbatch, trying for more\n"); - folio_batch_release(&fbatch); - goto get_more_pages; - } + ceph_wbc->processed_in_fbatch = i; + + return rc; +} + +static inline +void ceph_shift_unused_folios_left(struct folio_batch *fbatch) +{ + unsigned j, n = 0; + + /* shift unused page to beginning of fbatch */ + for (j = 0; j < folio_batch_count(fbatch); j++) { + if (!fbatch->folios[j]) + continue; + + if (n < j) { + fbatch->folios[n] = fbatch->folios[j]; } + n++; + } + + fbatch->nr = n; +} + +static +int ceph_submit_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_vino vino = ceph_vino(inode); + struct ceph_osd_request *req = NULL; + struct page *page = NULL; + bool caching = ceph_is_cache_enabled(inode); + u64 offset; + u64 len; + unsigned i; + new_request: - offset = ceph_fscrypt_page_offset(pages[0]); - len = wsize; + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); + len = ceph_wbc->wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, ceph_wbc->num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, false); + if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, false); - if (IS_ERR(req)) { - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, - min(num_ops, - CEPH_OSD_SLAB_OPS), - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, true); - BUG_ON(IS_ERR(req)); + &ci->i_layout, vino, + offset, &len, 0, + min(ceph_wbc->num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, + ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, + true); + BUG_ON(IS_ERR(req)); + } + + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { + struct folio *folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + page = &folio->page; + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + - thp_size(pages[locked_pages - 1]) - offset); - if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { - rc = -EIO; - goto release_folios; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + + if (!page) + continue; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - req->r_callback = writepages_finish; - req->r_inode = inode; - - /* Format the osd request message and submit the write */ - len = 
0; - data_pages = pages; - op_idx = 0; - for (i = 0; i < locked_pages; i++) { - struct page *page = ceph_fscrypt_pagecache_page(pages[i]); - - u64 cur_offset = page_offset(page); - /* - * Discontinuity in page range? Ceph can handle that by just passing - * multiple extents in the write op. - */ - if (offset + len != cur_offset) { - /* If it's full, stop here */ - if (op_idx + 1 == req->r_num_ops) - break; - - /* Kick off an fscache write with what we have so far. */ - ceph_fscache_write_to_cache(inode, offset, len, caching); - - /* Start a new extent */ - osd_req_op_extent_dup_last(req, op_idx, - cur_offset - offset); - doutc(cl, "got pages at %llu~%llu\n", offset, - len); - osd_req_op_extent_osd_data_pages(req, op_idx, - data_pages, len, 0, - from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - len = 0; - offset = cur_offset; - data_pages = pages + i; - op_idx++; - } - set_page_writeback(page); - if (caching) - ceph_set_page_fscache(page); - len += thp_size(page); + ceph_osdc_put_request(req); + return -EIO; + } + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + ceph_wbc->data_pages = ceph_wbc->pages; + ceph_wbc->op_idx = 0; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + u64 cur_offset; + + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + cur_offset = page_offset(page); + + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ + if (offset + len != cur_offset) { + /* If it's full, stop here */ + if (ceph_wbc->op_idx + 1 == req->r_num_ops) + break; + + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, + cur_offset - offset); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, + len, 0, + ceph_wbc->from_pool, + false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + len = 0; + offset = cur_offset; + ceph_wbc->data_pages = ceph_wbc->pages + i; + ceph_wbc->op_idx++; } - ceph_fscache_write_to_cache(inode, offset, len, caching); - - if (ceph_wbc.size_stable) { - len = min(len, ceph_wbc.i_size - offset); - } else if (i == locked_pages) { - /* writepages_finish() clears writeback pages - * according to the data length, so make sure - * data length covers all locked pages */ - u64 min_len = len + 1 - thp_size(page); - len = get_writepages_data_length(inode, pages[i - 1], - offset); - len = max(len, min_len); + + set_page_writeback(page); + + if (caching) + ceph_set_page_fscache(page); + + len += thp_size(page); + } + + ceph_fscache_write_to_cache(inode, offset, len, caching); + + if (ceph_wbc->size_stable) { + len = min(len, ceph_wbc->i_size - offset); + } else if (i == ceph_wbc->locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - thp_size(page); + len = get_writepages_data_length(inode, + ceph_wbc->pages[i - 1], + offset); + len = max(len, min_len); + } + + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); + } 
+ + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, len, + 0, ceph_wbc->from_pool, false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); + + ceph_wbc->from_pool = false; + if (i < ceph_wbc->locked_pages) { + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); + ceph_wbc->num_ops -= req->r_num_ops; + ceph_wbc->locked_pages -= i; + + /* allocate new pages array for next request */ + ceph_wbc->data_pages = ceph_wbc->pages; + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + memset(ceph_wbc->data_pages + i, 0, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + } else { + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); + /* request message now owns the pages array */ + ceph_wbc->pages = NULL; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + req = NULL; + + wbc->nr_to_write -= i; + if (ceph_wbc->pages) + goto new_request; + + return 0; +} + +static +void ceph_wait_until_current_writes_complete(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct page *page; + unsigned i, nr; + + if (wbc->sync_mode != WB_SYNC_NONE && + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc->head_snapc) { + ceph_wbc->index = 0; + + while ((ceph_wbc->index <= ceph_wbc->end) && + (nr = filemap_get_folios_tag(mapping, + &ceph_wbc->index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &ceph_wbc->fbatch))) { + for (i = 0; i < nr; i++) { + page = &ceph_wbc->fbatch.folios[i]->page; + if (page_snap_context(page) != ceph_wbc->snapc) + continue; + wait_on_page_writeback(page); + } + + folio_batch_release(&ceph_wbc->fbatch); + cond_resched(); } - if (IS_ENCRYPTED(inode)) - len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_writeback_ctl ceph_wbc; + int rc = 0; - doutc(cl, "got pages at %llu~%llu\n", offset, len); + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) + return 0; - if (IS_ENCRYPTED(inode) && - ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) - pr_warn_client(cl, - "bad encrypted write offset=%lld len=%llu\n", - offset, len); - - osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - BUG_ON(op_idx + 1 != req->r_num_ops); - - from_pool = false; - if (i < locked_pages) { - BUG_ON(num_ops <= req->r_num_ops); - num_ops -= req->r_num_ops; - locked_pages -= i; - - /* allocate new pages array for next request */ - data_pages = pages; - pages = kmalloc_array(locked_pages, sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (is_forced_umount(mapping)) { + /* we're in a forced umount, don't write! 
*/ + return -EIO; + } + + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto out; + } + +retry: + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); + if (rc == -ENODATA) { + /* hmm, why does writepages get called when there + is no dirty data? */ + rc = 0; + goto dec_osd_stopping_blocker; + } + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); + + while (!has_writeback_done(&ceph_wbc)) { + ceph_wbc.locked_pages = 0; + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; + +get_more_pages: + ceph_folio_batch_reinit(&ceph_wbc); + + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, + &ceph_wbc.index, + ceph_wbc.end, + ceph_wbc.tag, + &ceph_wbc.fbatch); + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", + ceph_wbc.tag, ceph_wbc.nr_folios); + + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) + break; + +process_folio_batch: + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + /* did we get anything? */ + if (!ceph_wbc.locked_pages) + goto release_folios; + + if (ceph_wbc.processed_in_fbatch) { + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); + + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && + ceph_wbc.locked_pages < ceph_wbc.max_pages) { + doutc(cl, "reached end fbatch, trying for more\n"); + goto get_more_pages; } - memcpy(pages, data_pages + i, - locked_pages * sizeof(*pages)); - memset(data_pages + i, 0, - locked_pages * sizeof(*pages)); - } else { - BUG_ON(num_ops != req->r_num_ops); - index = pages[i - 1]->index + 1; - /* request message now owns the pages array */ - pages = NULL; } - req->r_mtime = inode_get_mtime(inode); - ceph_osdc_start_request(&fsc->client->osdc, req); - req = NULL; + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + ceph_wbc.locked_pages = 0; + ceph_wbc.strip_unit_end = 0; - wbc->nr_to_write -= i; - if (pages) - goto new_request; + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { + ceph_wbc.nr_folios = + folio_batch_count(&ceph_wbc.fbatch); + goto process_folio_batch; + } /* * We stop writing back only if we are not doing @@ -1377,61 +1713,44 @@ new_request: * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) - done = true; + ceph_wbc.done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", - (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); - folio_batch_release(&fbatch); + (int)ceph_wbc.fbatch.nr, + ceph_wbc.fbatch.nr ? 
ceph_wbc.fbatch.folios[0] : NULL); + folio_batch_release(&ceph_wbc.fbatch); } - if (should_loop && !done) { + if (ceph_wbc.should_loop && !ceph_wbc.done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); - end = start_index - 1; /* OK even when start_index == 0 */ + /* OK even when start_index == 0 */ + ceph_wbc.end = ceph_wbc.start_index - 1; /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ - if (wbc->sync_mode != WB_SYNC_NONE && - start_index == 0 && /* all dirty pages were checked */ - !ceph_wbc.head_snapc) { - struct page *page; - unsigned i, nr; - index = 0; - while ((index <= end) && - (nr = filemap_get_folios_tag(mapping, &index, - (pgoff_t)-1, - PAGECACHE_TAG_WRITEBACK, - &fbatch))) { - for (i = 0; i < nr; i++) { - page = &fbatch.folios[i]->page; - if (page_snap_context(page) != snapc) - continue; - wait_on_page_writeback(page); - } - folio_batch_release(&fbatch); - cond_resched(); - } - } + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); - start_index = 0; - index = 0; + ceph_wbc.start_index = 0; + ceph_wbc.index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = ceph_wbc.index; + +dec_osd_stopping_blocker: + ceph_dec_osd_stopping_blocker(fsc->mdsc); out: - ceph_osdc_put_request(req); - ceph_put_snap_context(last_snapc); + ceph_put_snap_context(ceph_wbc.last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); + return rc; } - - /* * See if a given @snapc is either writeable, or already written. */ @@ -1447,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode, /** * ceph_find_incompatible - find an incompatible context and return it - * @page: page being dirtied + * @folio: folio being dirtied * - * We are only allowed to write into/dirty a page if the page is + * We are only allowed to write into/dirty a folio if the folio is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * - * Must be called with page lock held. + * Must be called with folio lock held. */ static struct ceph_snap_context * -ceph_find_incompatible(struct page *page) +ceph_find_incompatible(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { - doutc(cl, " %llx.%llx page %p is shutdown\n", - ceph_vinop(inode), page); + doutc(cl, " %llx.%llx folio %p is shutdown\n", + ceph_vinop(inode), folio); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc || snapc == ci->i_head_snapc) break; /* - * this page is already dirty in another (older) snap + * this folio is already dirty in another (older) snap * context! is it writeable now? 
*/ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", - ceph_vinop(inode), page, snapc); + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", + ceph_vinop(inode), folio, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); - /* yay, writeable, do it now (without dropping page lock) */ - doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", - ceph_vinop(inode), page, snapc); - if (clear_page_dirty_for_io(page)) { - int r = writepage_nounlock(page, NULL); + /* yay, writeable, do it now (without dropping folio lock) */ + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", + ceph_vinop(inode), folio, snapc); + if (folio_clear_dirty_for_io(folio)) { + int r = write_folio_nounlock(folio, NULL); if (r < 0) return ERR_PTR(r); } @@ -1511,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(*foliop, 0)); + snapc = ceph_find_incompatible(*foliop); if (snapc) { int r; @@ -1594,7 +1913,6 @@ out: const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, @@ -1602,6 +1920,7 @@ const struct address_space_operations ceph_aops = { .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, }; static void ceph_block_sigs(sigset_t *oldset) @@ -1718,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; - struct page *page = vmf->page; - loff_t off = page_offset(page); + struct folio *folio = page_folio(vmf->page); + loff_t off = folio_pos(folio); loff_t size = i_size_read(inode); size_t len; int want, got, err; @@ -1736,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (off + thp_size(page) <= size) - len = thp_size(page); + if (off + folio_size(folio) <= size) + len = folio_size(folio); else - len = offset_in_thp(page, size); + len = offset_in_folio(folio, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); @@ -1756,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); - /* Update time before taking page lock */ + /* Update time before taking folio lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; - lock_page(page); + folio_lock(folio); - if (page_mkwrite_check_truncate(page, inode) < 0) { - unlock_page(page); + if (folio_mkwrite_check_truncate(folio, inode) < 0) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; break; } - snapc = ceph_find_incompatible(page); + snapc = ceph_find_incompatible(folio); if (!snapc) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); + /* success. we'll keep the folio locked. 
*/ + folio_mark_dirty(folio); ret = VM_FAULT_LOCKED; break; } - unlock_page(page); + folio_unlock(folio); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 62e99e65250d..a321aa6d0ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, if (ptr_pos >= i_size_read(dir)) return NULL; - if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) { + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { ceph_readdir_cache_release(cache_ctl); - cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); - if (!cache_ctl->page) { - doutc(cl, " page %lu not found\n", ptr_pgoff); + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); + if (IS_ERR(cache_ctl->folio)) { + cache_ctl->folio = NULL; + doutc(cl, " folio %lu not found\n", ptr_pgoff); return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_rwsem, no need to use page lock */ - unlock_page(cache_ctl->page); - cache_ctl->dentries = kmap(cache_ctl->page); + i_rwsem, no need to use folio lock */ + folio_unlock(cache_ctl->folio); + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); } cache_ctl->index = idx & idx_mask; @@ -1092,19 +1093,20 @@ out: return err; } -static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + struct dentry *ret; int err; int op; err = ceph_wait_on_conflict_unlink(dentry); if (err) - return err; + return ERR_PTR(err); if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ @@ -1116,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, ceph_vinop(dir), dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { - err = -EROFS; + ret = ERR_PTR(-EROFS); goto out; } if (op == CEPH_MDS_OP_MKDIR && ceph_quota_is_max_files_exceeded(dir)) { - err = -EDQUOT; + ret = ERR_PTR(-EDQUOT); goto out; } if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir)) { - err = -ENOKEY; + ret = ERR_PTR(-ENOKEY); goto out; } req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { - err = PTR_ERR(req); + ret = ERR_CAST(req); goto out; } mode |= S_IFDIR; req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); if (IS_ERR(req->r_new_inode)) { - err = PTR_ERR(req->r_new_inode); + ret = ERR_CAST(req->r_new_inode); req->r_new_inode = NULL; goto out_req; } @@ -1165,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + ret = ERR_PTR(err); out_req: + if (!IS_ERR(ret) && req->r_dentry != dentry) + /* Some other dentry was spliced in */ + ret = dget(req->r_dentry); ceph_mdsc_put_request(req); out: - if (!err) + if (!IS_ERR(ret)) { + if (ret) + dentry = ret; ceph_init_inode_acls(d_inode(dentry), &as_ctx); - else + } else { d_drop(dentry); + } ceph_release_acl_sec_ctx(&as_ctx); - return err; + return ret; } static int ceph_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7dd6c2275085..6ac2bd555e86 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1845,10 +1845,9 
@@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { - if (ctl->page) { - kunmap(ctl->page); - put_page(ctl->page); - ctl->page = NULL; + if (ctl->folio) { + folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; } } @@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != ctl->page->index) { + if (!ctl->folio || pgoff != ctl->folio->index) { ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + if (idx == 0) - ctl->page = grab_cache_page(&dir->i_data, pgoff); - else - ctl->page = find_lock_page(&dir->i_data, pgoff); - if (!ctl->page) { + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; ctl->index = -1; - return idx == 0 ? -ENOMEM : 0; + return idx == 0 ? err : 0; } /* reading/filling the cache are serialized by - * i_rwsem, no need to use page lock */ - unlock_page(ctl->page); - ctl->dentries = kmap(ctl->page); + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); if (idx == 0) memset(ctl->dentries, 0, PAGE_SIZE); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 54b3421501e9..230e0c3f341f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); + atomic64_set(&mdsc->dirty_folios, 0); + init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7c9fee9e80d4..3e2a6fa7c19a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -458,6 +458,9 @@ struct ceph_mds_client { atomic_t stopping_blockers; struct completion stopping_waiter; + atomic64_t dirty_folios; + wait_queue_head_t flush_end_wq; + atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4344e1f11806..f3951253e393 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s) */ sync_filesystem(s); + if (atomic64_read(&mdsc->dirty_folios) > 0) { + wait_queue_head_t *wq = &mdsc->flush_end_wq; + long timeleft = wait_event_killable_timeout(*wq, + atomic64_read(&mdsc->dirty_folios) <= 0, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + spin_lock(&mdsc->stopping_lock); mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; wait = !!atomic_read(&mdsc->stopping_blockers); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7fa1e7be50e4..bb0db0cc8003 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf) } struct ceph_readdir_cache_control { - struct page *page; + struct folio *folio; struct dentry **dentries; int index; }; diff --git a/fs/coda/dir.c b/fs/coda/dir.c 
index a3e2dfeedfbf..ab69d8f0cec2 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -166,8 +166,8 @@ err_out: return error; } -static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *de, umode_t mode) +static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct CodaFid newfid; if (is_root_inode(dir) && coda_iscontrol(name, len)) - return -EPERM; + return ERR_PTR(-EPERM); attrs.va_mode = mode; - error = venus_mkdir(dir->i_sb, coda_i2f(dir), + error = venus_mkdir(dir->i_sb, coda_i2f(dir), name, len, &newfid, &attrs); if (error) goto err_out; - + inode = coda_iget(dir->i_sb, &newfid, &attrs); if (IS_ERR(inode)) { error = PTR_ERR(inode); @@ -195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, coda_dir_inc_nlink(dir); coda_dir_update_mtime(dir); d_instantiate(de, inode); - return 0; + return NULL; err_out: d_drop(de); - return error; + return ERR_PTR(error); } /* try to make de an entry in dir_inodde linked to source_de */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7d10278db30d..5568cb74b322 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1280,8 +1280,8 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; @@ -1461,7 +1461,7 @@ out_put: put_fragment(frag); out: - return ret; + return ERR_PTR(ret); } static int configfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/coredump.c b/fs/coredump.c index 4375c70144d0..d6a92cd6018e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, { unsigned long addr; struct page *dump_page; + int locked, ret; dump_page = dump_page_alloc(); if (!dump_page) return 0; + ret = 0; + locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; + if (!locked) { + if (mmap_read_lock_killable(current->mm)) + goto out; + locked = 1; + } + /* * To avoid having to allocate page tables for virtual address * ranges that have never been used yet, and also to make it @@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, * NULL when encountering an empty page table entry that would * otherwise have been filled with the zero page. 
*/ - page = get_dump_page(addr); + page = get_dump_page(addr, &locked); if (page) { + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) { - dump_page_free(dump_page); - return 0; - } + if (stop) + goto out; } else { dump_skip(cprm, PAGE_SIZE); } + + if (dump_interrupted()) + goto out; + + if (!need_resched()) + continue; + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } cond_resched(); } + ret = 1; +out: + if (locked) + mmap_read_unlock(current->mm); + dump_page_free(dump_page); - return 1; + return ret; } #endif diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 328470d40dec..b74b5937e695 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -153,8 +153,8 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page - * @page: the locked pagecache page containing the data to encrypt + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio + * @folio: the locked pagecache folio containing the data to encrypt * @len: size of the data to encrypt, in bytes * @offs: offset within @page of the data to encrypt, in bytes * @gfp_flags: memory allocation flags; see details below @@ -177,23 +177,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, * * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) - +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const struct fscrypt_inode_info *ci = inode->i_crypt_info; const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; - u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + (offs >> du_bits); unsigned int i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + if (WARN_ON_ONCE(!folio_test_locked(folio))) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) @@ -205,7 +203,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, for (i = offs; i < offs + len; i += du_size, index++) { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, - page, ciphertext_page, + &folio->page, ciphertext_page, du_size, i, gfp_flags); if (err) { fscrypt_free_bounce_page(ciphertext_page); @@ -213,7 +211,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, } } SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)page); + set_page_private(ciphertext_page, (unsigned long)folio); return ciphertext_page; } EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); @@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = 0; 
+ s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_length(iter); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); - return dax_mem2blk_err(ret); + if (ret < 0) + return dax_mem2blk_err(ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; u64 length = iomap_length(iter); - s64 written = 0; + int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; + return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed @@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + length - 1) >> PAGE_SHIFT); + iter->pos >> PAGE_SHIFT, + (iter->pos + length - 1) >> PAGE_SHIFT); do { + loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - long rc; int id; + length = min_t(u64, PAGE_SIZE - offset, length); + id = dax_read_lock(); - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) + ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iter, pos, size); + ret = dax_memzero(iter, pos, length); dax_read_unlock(id); - if (rc < 0) - return rc; - pos += size; - length -= size; - written += size; + if (ret < 0) + return ret; + + ret = iomap_iter_advance(iter, &length); + if (ret) + return ret; } while (length > 0); if (did_zero) *did_zero = true; - return written; + return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(dax_truncate_page); -static loff_t dax_iomap_iter(const struct iomap_iter *iomi, - struct iov_iter *iter) +static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const 
struct iomap *srcmap = iomap_iter_srcmap(iomi); @@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (pos >= end) return 0; - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return iov_iter_zero(min(length, end - pos), iter); + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { + done = iov_iter_zero(min(length, end - pos), iter); + return iomap_iter_advance(iomi, &done); + } } /* @@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } id = dax_read_lock(); - while (pos < end) { + while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); @@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - pos += xfer; - length -= xfer; - done += xfer; - - if (xfer == 0) + length = xfer; + ret = iomap_iter_advance(iomi, &length); + if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); - return done ? done : ret; + return ret; } /** @@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? */ continue; } @@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) - iter.processed = PAGE_SIZE; + if (!(ret & VM_FAULT_ERROR)) { + u64 length = PAGE_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } if (iomap_errp) @@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) - iter.processed = PMD_SIZE; + if (ret != VM_FAULT_FALLBACK) { + u64 length = PMD_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } unlock_entry: @@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -static loff_t dax_range_compare_iter(struct iomap_iter *it_src, +static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; - return len; + goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { @@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (!*same) len = 0; dax_read_unlock(id); - return len; + +advance: + dest_len = len; + ret = iomap_iter_advance(it_src, &len); + if (!ret) + ret = iomap_iter_advance(it_dest, &dest_len); + return ret; out_unlock: 
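
The DAX hunks above all follow the same conversion: the iteration helpers no longer return a byte count through iter.processed but an int status through iter.status, and they advance the iterator explicitly with iomap_iter_advance() / iomap_iter_advance_full(). A minimal sketch of the new loop-body shape — example_iter() and do_example_work() are hypothetical names, not part of this patch:

    /* Hypothetical helper following the ->status + explicit-advance convention. */
    static int example_iter(struct iomap_iter *iter)
    {
        u64 length = iomap_length(iter);
        int ret;

        /* Nothing to do for holes: just consume the whole mapping. */
        if (iter->iomap.type == IOMAP_HOLE)
            return iomap_iter_advance(iter, &length);

        ret = do_example_work(iter->inode, iter->pos, length);  /* hypothetical work */
        if (ret < 0)
            return ret;                     /* error status stops the iteration */

        return iomap_iter_advance(iter, &length);   /* we processed 'length' bytes */
    }

    /* Caller side, as in dax_file_unshare() / dax_zero_range() above: */
    while ((ret = iomap_iter(&iter, ops)) > 0)
        iter.status = example_iter(&iter);

Advancing the iterator also moves iter->pos and shrinks the remaining length for the next pass, which appears to be why dax_iomap_iter() above re-reads iomi->pos on each loop instead of tracking a local position.
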
dax_read_unlock(id); @@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret, compared = 0; + int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, + status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); - if (compared < 0) + if (status < 0) return ret; - src_iter.processed = dst_iter.processed = compared; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index e3634916ffb9..623947d6a676 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2480,7 +2480,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); - wake_up_all(d_wait); + if (wq_has_sleeper(d_wait)) + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) @@ -2687,52 +2688,6 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); -/** - * d_exact_alias - find and hash an exact unhashed alias - * @entry: dentry to add - * @inode: The inode to go with this dentry - * - * If an unhashed dentry with the same name/parent and desired - * inode already exists, hash and return it. Otherwise, return - * NULL. - * - * Parent directory should be locked. - */ -struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) -{ - struct dentry *alias; - unsigned int hash = entry->d_name.hash; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. 
- */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (!d_same_name(alias, entry->d_parent, &entry->d_name)) - continue; - spin_lock(&alias->d_lock); - if (!d_unhashed(alias)) { - spin_unlock(&alias->d_lock); - alias = NULL; - } else { - dget_dlock(alias); - __d_rehash(alias); - spin_unlock(&alias->d_lock); - } - spin_unlock(&inode->i_lock); - return alias; - } - spin_unlock(&inode->i_lock); - return NULL; -} -EXPORT_SYMBOL(d_exact_alias); - static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1096ff8562fa..42e4d6eeb29f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -12,6 +12,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -21,7 +23,6 @@ #include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> -#include <linux/parser.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> @@ -87,14 +88,14 @@ enum { Opt_err }; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_ptmxmode, "ptmxmode=%o"}, - {Opt_newinstance, "newinstance"}, - {Opt_max, "max=%d"}, - {Opt_err, NULL} +static const struct fs_parameter_spec devpts_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_s32 ("max", Opt_max), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("newinstance", Opt_newinstance), + fsparam_u32oct ("ptmxmode", Opt_ptmxmode), + fsparam_u32 ("uid", Opt_uid), + {} }; struct pts_fs_info { @@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi) deactivate_super(fsi->sb); } -#define PARSE_MOUNT 0 -#define PARSE_REMOUNT 1 - /* - * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. - * - * Note: @data may be NULL (in which case all options are set to default). + * devpts_parse_param - Parse mount parameters */ -static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - kuid_t uid; - kgid_t gid; - - opts->setuid = 0; - opts->setgid = 0; - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->mode = DEVPTS_DEFAULT_MODE; - opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - opts->max = NR_UNIX98_PTY_MAX; - - /* Only allow instances mounted from the initial mount - * namespace to tap the reserve pool of ptys. 
- */ - if (op == PARSE_MOUNT) - opts->reserve = - (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); - - while ((p = strsep(&data, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - int option; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - opts->uid = uid; - opts->setuid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - opts->gid = gid; - opts->setgid = 1; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; - case Opt_ptmxmode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->ptmxmode = option & S_IALLUGO; - break; - case Opt_newinstance: - break; - case Opt_max: - if (match_int(&args[0], &option) || - option < 0 || option > NR_UNIX98_PTY_MAX) - return -EINVAL; - opts->max = option; - break; - default: - pr_err("called with bogus options\n"); - return -EINVAL; - } + struct pts_fs_info *fsi = fc->s_fs_info; + struct pts_mount_opts *opts = &fsi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, devpts_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = result.uid; + opts->setuid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->setgid = 1; + break; + case Opt_mode: + opts->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_ptmxmode: + opts->ptmxmode = result.uint_32 & S_IALLUGO; + break; + case Opt_newinstance: + break; + case Opt_max: + if (result.uint_32 > NR_UNIX98_PTY_MAX) + return invalf(fc, "max out of range"); + opts->max = result.uint_32; + break; } return 0; } -static int mknod_ptmx(struct super_block *sb) +static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; int rc = -ENOMEM; @@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) } } -static int devpts_remount(struct super_block *sb, int *flags, char *data) +static int devpts_reconfigure(struct fs_context *fc) { - int err; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb); + struct pts_fs_info *new = fc->s_fs_info; - err = parse_mount_options(data, PARSE_REMOUNT, opts); + /* Apply the revised options. We don't want to change ->reserve. + * Ideally, we'd update each option conditionally on it having been + * explicitly changed, but the default is to reset everything so that + * would break UAPI... 
+ */ + fsi->mount_opts.setuid = new->mount_opts.setuid; + fsi->mount_opts.setgid = new->mount_opts.setgid; + fsi->mount_opts.uid = new->mount_opts.uid; + fsi->mount_opts.gid = new->mount_opts.gid; + fsi->mount_opts.mode = new->mount_opts.mode; + fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode; + fsi->mount_opts.max = new->mount_opts.max; /* * parse_mount_options() restores options to default values @@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) */ update_ptmx_mode(fsi); - return err; + return 0; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) @@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations devpts_sops = { .statfs = simple_statfs, - .remount_fs = devpts_remount, .show_options = devpts_show_options, }; -static void *new_pts_fs_info(struct super_block *sb) -{ - struct pts_fs_info *fsi; - - fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); - if (!fsi) - return NULL; - - ida_init(&fsi->allocated_ptys); - fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; - fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - fsi->sb = sb; - - return fsi; -} - -static int -devpts_fill_super(struct super_block *s, void *data, int silent) +static int devpts_fill_super(struct super_block *s, struct fs_context *fc) { + struct pts_fs_info *fsi = DEVPTS_SB(s); struct inode *inode; - int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; + fsi->sb = s; - error = -ENOMEM; - s->s_fs_info = new_pts_fs_info(s); - if (!s->s_fs_info) - goto fail; - - error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); - if (error) - goto fail; - - error = -ENOMEM; inode = new_inode(s); if (!inode) - goto fail; + return -ENOMEM; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); - goto fail; + return -ENOMEM; } - error = mknod_ptmx(s); - if (error) - goto fail_dput; - - return 0; -fail_dput: - dput(s->s_root); - s->s_root = NULL; -fail: - return error; + return mknod_ptmx(s, fc); } /* - * devpts_mount() + * devpts_get_tree() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, devpts_fill_super); +} + +static void devpts_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations devpts_context_ops = { + .free = devpts_free_fc, + .parse_param = devpts_parse_param, + .get_tree = devpts_get_tree, + .reconfigure = devpts_reconfigure, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int devpts_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, devpts_fill_super); + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.uid = GLOBAL_ROOT_UID; + fsi->mount_opts.gid = GLOBAL_ROOT_GID; + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->mount_opts.max = NR_UNIX98_PTY_MAX; + + if (fc->purpose == FS_CONTEXT_FOR_MOUNT && + current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) + fsi->mount_opts.reserve = true; + + fc->s_fs_info = fsi; + fc->ops = &devpts_context_ops; + return 0; } static void devpts_kill_sb(struct super_block *sb) @@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .mount = devpts_mount, + .init_fs_context = devpts_init_fs_context, + .parameters = devpts_param_specs, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a9819ddb1ab8..51a5c54eb740 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -503,18 +503,24 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; struct inode *lower_dir; rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); - if (rc || d_really_is_negative(lower_dentry)) + if (rc) + goto out; + + lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, + lower_dentry, mode); + rc = PTR_ERR(lower_dentry); + if (IS_ERR(lower_dentry)) + goto out; + rc = 0; + if (d_unhashed(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) @@ -526,7 +532,7 @@ out: inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); - return rc; + return ERR_PTR(rc); } static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0b1c878317ab..e7b7f426fecf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = { .destroy_inode = ecryptfs_destroy_inode, .free_inode = ecryptfs_free_inode, .statfs = ecryptfs_statfs, - .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, .show_options = ecryptfs_show_options }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6eae8cf655c1..0486e9b68bc6 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -421,7 +421,7 @@ static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, if (err) size = 0; - inode_lock(inode); + inode_lock_nested(inode, I_MUTEX_CHILD); i_size_write(inode, size); inode_unlock(inode); @@ -474,12 +474,25 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, return err; } +static void efivarfs_deactivate_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + /* + * note: here s->destroy_work is free for reuse (which + * will happen in deactivate_super) + */ + deactivate_super(s); +} + +static struct file_system_type efivarfs_type; + static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, 
void *ptr) { struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, pm_nb); - struct path path = { .mnt = NULL, .dentry = sfi->sb->s_root, }; + struct path path; struct efivarfs_ctx ectx = { .ctx = { .actor = efivarfs_actor, @@ -487,6 +500,7 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, .sb = sfi->sb, }; struct file *file; + struct super_block *s = sfi->sb; static bool rescan_done = true; if (action == PM_HIBERNATION_PREPARE) { @@ -499,11 +513,43 @@ static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, if (rescan_done) return NOTIFY_DONE; + /* ensure single superblock is alive and pin it */ + if (!atomic_inc_not_zero(&s->s_active)) + return NOTIFY_DONE; + pr_info("efivarfs: resyncing variable state\n"); - /* O_NOATIME is required to prevent oops on NULL mnt */ + path.dentry = sfi->sb->s_root; + + /* + * do not add SB_KERNMOUNT which a single superblock could + * expose to userspace and which also causes MNT_INTERNAL, see + * below + */ + path.mnt = vfs_kern_mount(&efivarfs_type, 0, + efivarfs_type.name, NULL); + if (IS_ERR(path.mnt)) { + pr_err("efivarfs: internal mount failed\n"); + /* + * We may be the last pinner of the superblock but + * calling efivarfs_kill_sb from within the notifier + * here would deadlock trying to unregister it + */ + INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work); + schedule_work(&s->destroy_work); + return PTR_ERR(path.mnt); + } + + /* path.mnt now has pin on superblock, so this must be above one */ + atomic_dec(&s->s_active); + file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME, current_cred()); + /* + * safe even if last put because no MNT_INTERNAL means this + * will do delayed deactivate_super and not deadlock + */ + mntput(path.mnt); if (IS_ERR(file)) return NOTIFY_DONE; diff --git a/fs/eventfd.c b/fs/eventfd.c index 76129bfcd663..af42b2c7d235 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags) if (fd < 0) goto err; - file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); + file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, + ctx, flags, FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } - - file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..9b06a0ab9c32 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) * * we must do our busy polling with irqs enabled */ -static bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); @@ -448,7 +448,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) budget = BUSY_POLL_BUDGET; if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { - napi_busy_loop(napi_id, nonblock ? 
NULL : ep_busy_loop_end, + napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; @@ -560,7 +560,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) #else -static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } @@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, return ret; } +static int ep_try_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + int res; + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. + */ + res = ep_send_events(ep, events, maxevents); + if (res > 0) + ep_suspend_napi_irqs(ep); + return res; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2031,23 +2047,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, while (1) { if (eavail) { - /* - * Try to transfer events to user space. In case we get - * 0 events and there's still timeout left over, we go - * trying again in search of more luck. - */ - res = ep_send_events(ep, events, maxevents); - if (res) { - if (res > 0) - ep_suspend_napi_irqs(ep); + res = ep_try_send_events(ep, events, maxevents); + if (res) return res; - } } if (timed_out) return 0; - eavail = ep_busy_loop(ep, timed_out); + eavail = ep_busy_loop(ep); if (eavail) continue; @@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return do_epoll_ctl(epfd, op, fd, &epds, false); } +static int ep_check_params(struct file *file, struct epoll_event __user *evs, + int maxevents) +{ + /* The maximum number of event must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + if (!is_file_epoll(file)) + return -EINVAL; + + return 0; +} + +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents) +{ + struct eventpoll *ep; + int ret; + + ret = ep_check_params(file, events, maxevents); + if (unlikely(ret)) + return ret; + + ep = file->private_data; + /* + * Racy call, but that's ok - it should get retried based on + * poll readiness anyway. + */ + if (ep_events_available(ep)) + return ep_try_send_events(ep, events, maxevents); + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). @@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; - - /* The maximum number of event must be greater than zero */ - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ - if (!access_ok(events, maxevents * sizeof(struct epoll_event))) - return -EFAULT; + int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; - /* - * We have to check that the file structure underneath the fd - * the user passed to us _is_ an eventpoll file. 
- */ - if (!is_file_epoll(fd_file(f))) - return -EINVAL; + ret = ep_check_params(fd_file(f), events, maxevents); + if (unlikely(ret)) + return ret; /* * At this point it is safe to assume that the "private_data" contains diff --git a/fs/exec.c b/fs/exec.c index 506cd411f4ac..f45859ad13ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -755,8 +755,6 @@ int setup_arg_pages(struct linux_binprm *bprm, mm->arg_start = bprm->p; #endif - if (bprm->loader) - bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 8b30027d8251..fede0283d6e2 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -840,8 +840,8 @@ unlock: return err; } -static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -851,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, loff_t size = i_size_read(dir); if (unlikely(exfat_forced_shutdown(sb))) - return -EIO; + return ERR_PTR(-EIO); mutex_lock(&EXFAT_SB(sb)->s_lock); exfat_set_volume_dirty(sb); @@ -882,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, unlock: mutex_unlock(&EXFAT_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int exfat_check_dir_empty(struct super_block *sb, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0c899cfba578..b5845c4846b8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 8346ab9534c1..bde617a66cec 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct mnt_idmap * idmap, - struct inode * dir, struct dentry * dentry, umode_t mode) +static struct dentry *ext2_mkdir(struct mnt_idmap * idmap, + struct inode * dir, struct dentry * dentry, + umode_t mode) { struct inode * inode; int err; err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode_inc_link_count(dir); @@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap, d_instantiate_new(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..d04d8a7f12e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3290,6 +3290,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 536d56d15072..716cc6096870 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3004,19 +3004,19 @@ out: return err; } -static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry 
*ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -3066,7 +3066,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return ERR_PTR(err); } /* diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 69b8a7221a2b..37abee5016c3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -522,7 +522,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index de4da6d9cd93..ece5208223c1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2500,7 +2500,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) return 0; retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a278c7da8177..24dca4dc85a9 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -684,23 +684,23 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + return ERR_PTR(-EIO); err = f2fs_dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; @@ -722,12 +722,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); f2fs_balance_fs(sbi, true); - return 0; + return NULL; out_fail: clear_inode_flag(inode, FI_INC_LINK); f2fs_handle_failed_inode(inode); - return err; + return ERR_PTR(err); } static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index f06f6ba643cc..23e9b9371ec3 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -339,8 +339,8 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&MSDOS_SB(sb)->s_lock); fat_flush_inodes(sb, dir, inode); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + 
return ERR_PTR(err); } /***** Unlink a file */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 926c26e90ef8..dd910edd2404 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -841,8 +841,8 @@ out: return err; } -static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); mutex_unlock(&MSDOS_SB(sb)->s_lock); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, diff --git a/fs/file.c b/fs/file.c index d868cdb95d1e..dc3f7e120e3e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,28 @@ #include "internal.h" +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +{ + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} + /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count @@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) return true; } - /* - * If the reference count was already in the dead zone, then this - * put() operation is imbalanced. Warn, put the reference count back to - * DEAD and tell the caller to not deconstruct the object. - */ - if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { - atomic_long_set(&ref->refcnt, FILE_REF_DEAD); - return false; - } - - /* - * This is a put() operation on a saturated refcount. Restore the - * mean saturation value and tell the caller to not deconstruct the - * object. - */ - if (cnt > FILE_REF_MAXREF) - atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); - return false; + return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); @@ -418,17 +423,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. 
+ */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -577,6 +590,7 @@ repeat: __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); @@ -612,22 +626,14 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ - void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; @@ -642,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - WARN_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -650,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file) /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } @@ -679,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -1178,8 +1184,23 @@ struct fd fdget_raw(unsigned int fd) */ static inline bool file_needs_f_pos_lock(struct file *file) { - return (file->f_mode & FMODE_ATOMIC_POS) && - (file_count(file) > 1 || file->f_op->iterate_shared); + if (!(file->f_mode & FMODE_ATOMIC_POS)) + return false; + if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) + return true; + if (file->f_op->iterate_shared) + return true; + return false; +} + +bool file_seek_cur_needs_f_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; + + VFS_WARN_ON_ONCE((file_count(file) > 1) && + !mutex_is_locked(&file->f_pos_lock)); + return true; } struct fd fdget_pos(unsigned int fd) @@ -1187,7 +1208,7 @@ struct fd fdget_pos(unsigned int fd) struct fd f = fdget(fd); struct file *file = fd_file(f); - if (file && file_needs_f_pos_lock(file)) { + if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } @@ -1230,14 +1251,34 @@ __releases(&files->file_lock) struct fdtable *fdt; /* 
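The f_pos locking helpers reworked above are consumed through the fdget_pos()/fdput_pos() pair; below is a simplified read(2)-style caller, assuming the usual VFS helpers (example_read() is hypothetical, and the real syscall path goes through file_ppos() rather than touching ->f_pos directly).

        static ssize_t example_read(unsigned int fd, char __user *buf, size_t count)
        {
                struct fd f = fdget_pos(fd);    /* takes ->f_pos_lock only when the file can be shared */
                ssize_t ret = -EBADF;

                if (!fd_empty(f)) {
                        loff_t pos = fd_file(f)->f_pos;

                        ret = vfs_read(fd_file(f), buf, count, &pos);
                        if (ret >= 0)
                                fd_file(f)->f_pos = pos;
                }
                fdput_pos(f);                   /* drops ->f_pos_lock if it was taken */
                return ret;
        }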
- * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); - tofree = fdt->fd[fd]; + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); diff --git a/fs/file_table.c b/fs/file_table.c index 5c00dc38558d..c04ed94cdc4b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (unlikely(get_nr_files() >= files_stat.max_files) && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -511,31 +512,37 @@ void flush_delayed_fput(void) } EXPORT_SYMBOL_GPL(flush_delayed_fput); -void fput(struct file *file) +static void __fput_deferred(struct file *file) { - if (file_ref_put(&file->f_ref)) { - struct task_struct *task = current; + struct task_struct *task = current; + + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } - if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { - file_free(file); + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; - } - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_task_work, ____fput); - if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) - return; - /* - * After this task has run exit_task_work(), - * task_work_add() will fail. Fall through to delayed - * fput to avoid leaking *file. - */ - } - - if (llist_add(&file->f_llist, &delayed_fput_list)) - schedule_delayed_work(&delayed_fput_work, 1); + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. 
+ */ } + + if (llist_add(&file->f_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); } +void fput(struct file *file) +{ + if (unlikely(file_ref_put(&file->f_ref))) + __fput_deferred(file); +} +EXPORT_SYMBOL(fput); + /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without @@ -549,10 +556,32 @@ void __fput_sync(struct file *file) if (file_ref_put(&file->f_ref)) __fput(file); } - -EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); +/* + * Equivalent to __fput_sync(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close_sync(struct file *file) +{ + if (likely(file_ref_put_close(&file->f_ref))) + __fput(file); +} + +/* + * Equivalent to fput(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close(struct file *file) +{ + if (file_ref_put_close(&file->f_ref)) + __fput_deferred(file); +} + void __init files_init(void) { struct kmem_cache_args args = { diff --git a/fs/fsopen.c b/fs/fsopen.c index 094a7f510edf..1aaf4cb2afb2 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; - param.file = fget(aux); + param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2c3a4d09e500..51e31df4c546 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -77,7 +77,7 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { return !fc->initialized || (for_background && fc->blocked) || - (fc->io_uring && !fuse_uring_ready(fc)); + (fc->io_uring && fc->connected && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ebd2931b4f2a..82bf458fa9db 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -208,11 +208,11 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; atomic_set(&ring->queue_refs, 0); + smp_store_release(&fc->ring, ring); spin_unlock(&fc->lock); return ring; @@ -1041,7 +1041,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, unsigned int issue_flags, struct fuse_conn *fc) { const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); - struct fuse_ring *ring = fc->ring; + struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; int err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3805f9b06c9d..fa8f1141ea74 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -781,9 +781,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, - struct fuse_args *args, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -792,11 +792,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_forget_link *forget; if (fuse_is_bad(dir)) - return -EIO; + return 
ERR_PTR(-EIO); forget = fuse_alloc_forget(); if (!forget) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); @@ -826,29 +826,43 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) - return PTR_ERR(d); + return d; - if (d) { + if (d) fuse_change_entry_timeout(d, &outarg); - dput(d); - } else { + else fuse_change_entry_timeout(entry, &outarg); - } fuse_dir_changed(dir); - return 0; + return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); - return err; + return ERR_PTR(err); +} + +static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + /* + * Note that when creating anything other than a directory we + * can be sure create_new_entry() will NOT return an alternate + * dentry as d_splice_alias() only returns an alternate dentry + * for directories. So we don't need to check for that case + * when passing back the result. + */ + WARN_ON_ONCE(S_ISDIR(mode)); + + return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -871,7 +885,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(idmap, fm, &args, dir, entry, mode); + return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, @@ -898,8 +912,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return err; } -static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -934,7 +948,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[1].value = entry->d_name.name; args.in_args[2].size = len; args.in_args[2].value = link; - return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); + return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1131,7 +1145,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); + err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define 
GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6fbbaaad1cd0..198a8cbaf5e5 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1248,14 +1248,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, * @dentry: The dentry of the new directory * @mode: The mode of the new directory * - * Returns: errno + * Returns: the dentry, or ERR_PTR(errno) */ -static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); - return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); + + return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0)); } /** diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b75c26045df4..86a6b317b474 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); - return res; + return ERR_PTR(res); } d_instantiate(dentry, inode); mark_inode_dirty(inode); - return 0; + return NULL; } /* diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f5c4b3e31a1c..876bbb80fb4d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); + return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0)); } static int hfsplus_rename(struct mnt_idmap *idmap, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e0741e468956..a2c6b9051c5b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -679,17 +679,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino, return err; } -static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, - struct dentry *dentry, umode_t mode) +static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, + struct dentry *dentry, umode_t mode) { + struct inode *inode; char *file; int err; if ((file = dentry_name(dentry)) == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); err = do_mkdir(file, mode); + if (err) { + dentry = ERR_PTR(err); + } else { + inode = hostfs_iget(dentry->d_sb, file); + d_drop(dentry); + dentry = d_splice_alias(inode, dentry); + } __putname(file); - return err; + return dentry; } static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d0edf9ed33b6..e3cdc421dfba 100644 
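The ->mkdir() conversions in this diff (continuing below with hpfs and hugetlbfs) all follow the same new contract: return NULL when the dentry passed in was used, a different dentry when d_splice_alias() produced one, or an ERR_PTR() on failure. A condensed sketch of that shape for a made-up "examplefs" (both examplefs_new_inode() and examplefs_add_entry() are hypothetical):

        static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap,
                                              struct inode *dir,
                                              struct dentry *dentry, umode_t mode)
        {
                struct inode *inode;
                int err;

                inode = examplefs_new_inode(dir, S_IFDIR | mode);       /* hypothetical */
                if (IS_ERR(inode))
                        return ERR_CAST(inode);

                err = examplefs_add_entry(dir, dentry, inode);          /* hypothetical */
                if (err) {
                        iput(inode);
                        return ERR_PTR(err);
                }

                d_instantiate(dentry, inode);
                return NULL;    /* the dentry we were handed is now in use */
        }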
--- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, int r; struct hpfs_dirent dee; int err; - if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? -EINVAL : err); hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); @@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); - return 0; + return NULL; bail3: iput(result); bail2: @@ -123,7 +123,7 @@ bail1: hpfs_free_sectors(dir->i_sb, fno, 1); bail: hpfs_unlock(dir->i_sb); - return err; + return ERR_PTR(err); } static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0fc179a59830..d98caedbb723 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -991,14 +991,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, diff --git a/fs/init.c b/fs/init.c index e9387b6c4f30..eef5124885e3 100644 --- a/fs/init.c +++ b/fs/init.c @@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + if (!error) { + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } done_path_create(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 5587aabdaa5e..99318b157a9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head) free_inode_nonrcu(inode); } -static struct inode *alloc_inode(struct super_block *sb) +/** + * alloc_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. 
+ * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; @@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + struct super_block *sb = inode->i_sb; + + spin_lock(&sb->s_inode_list_lock); + list_add(&inode->i_sb_list, &sb->s_inodes); + spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { + struct super_block *sb = inode->i_sb; + if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); + spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); + spin_unlock(&sb->s_inode_list_lock); } } @@ -806,23 +820,16 @@ static void evict(struct inode *inode) /* * Wake up waiters in __wait_on_freeing_inode(). * - * Lockless hash lookup may end up finding the inode before we removed - * it above, but only lock it *after* we are done with the wakeup below. - * In this case the potential waiter cannot safely block. + * It is an invariant that any thread we need to wake up is already + * accounted for before remove_inode_hash() acquires ->i_lock -- both + * sides take the lock and sleep is aborted if the inode is found + * unhashed. Thus either the sleeper wins and goes off CPU, or removal + * wins and the sleeper aborts after testing with the lock. * - * The inode being unhashed after the call to remove_inode_hash() is - * used as an indicator whether blocking on it is safe. + * This also means we don't need any fences for the call below. */ - spin_lock(&inode->i_lock); - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -900,46 +907,6 @@ again: } EXPORT_SYMBOL_GPL(evict_inodes); -/** - * invalidate_inodes - attempt to free all inodes on a superblock - * @sb: superblock to operate on - * - * Attempts to free all inodes (including dirty inodes) for a given superblock. - */ -void invalidate_inodes(struct super_block *sb) -{ - struct inode *inode, *next; - LIST_HEAD(dispose); - -again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } - if (atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); - spin_unlock(&inode->i_lock); - list_add(&inode->i_lru, &dispose); - if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); - cond_resched(); - dispose_list(&dispose); - goto again; - } - } - spin_unlock(&sb->s_inode_list_lock); - - dispose_list(&dispose); -} - /* * Isolate the inode from the LRU in preparation for freeing it. 
* @@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** - * new_inode_pseudo - obtain an inode - * @sb: superblock - * - * Allocates a new inode for given superblock. - * Inode wont be chained in superblock s_inodes list - * This means : - * - fs can't be unmount - * - quotas, fsnotify, writeback can't work - */ -struct inode *new_inode_pseudo(struct super_block *sb) -{ - return alloc_inode(sb); -} - -/** * new_inode - obtain an inode * @sb: superblock * @@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - inode = new_inode_pseudo(sb); + inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; @@ -1348,8 +1300,8 @@ again: } if (set && unlikely(set(inode, data))) { - inode = NULL; - goto unlock; + spin_unlock(&inode_hash_lock); + return NULL; } /* @@ -1361,14 +1313,14 @@ again: hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); -unlock: - spin_unlock(&inode_hash_lock); return inode; } @@ -1497,8 +1449,8 @@ again: inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); + inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); + +#ifdef CONFIG_DEBUG_VFS +/* + * Dump an inode. + * + * TODO: add a proper inode dumping routine, this is a stub to get debug off the + * ground. 
+ */ +void dump_inode(struct inode *inode, const char *reason) +{ + pr_warn("%s encountered for inode %px", reason, inode); +} + +EXPORT_SYMBOL(dump_inode); +#endif diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..b9b3e29a73fd 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -118,6 +118,9 @@ static inline void put_file_access(struct file *file) } } +void fput_close_sync(struct file *); +void fput_close(struct file *); + /* * super.c */ @@ -187,8 +190,8 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); -long do_ftruncate(struct file *file, loff_t length, int small); -long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_ftruncate(struct file *file, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); @@ -207,7 +210,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -void invalidate_inodes(struct super_block *sb); /* * dcache.c @@ -325,6 +327,7 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted * @path: path to check @@ -338,3 +341,5 @@ static inline bool path_mounted(const struct path *path) return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); +bool file_seek_cur_needs_f_lock(struct file *file); +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map); diff --git a/fs/ioctl.c b/fs/ioctl.c index 638a36be31c1..c91fd2b46a77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. 
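One way the fput_close()/fput_close_sync() helpers declared in fs/internal.h above might be used, on a close-type path that is very likely dropping the final reference (example_close() is hypothetical; the natural in-tree consumer would be a filp_close()-style path):

        static void example_close(struct file *file)
        {
                /* ...flush, dnotify and POSIX-lock cleanup done by the caller... */
                fput_close(file);       /* like fput(), but fast-paths the final put */
        }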
*/ -long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) return error; } -static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; @@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, return ret; } -static long ioctl_file_clone_range(struct file *file, - struct file_clone_range __user *argp) +static int ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..814b7f679486 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> -#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -40,8 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -366,20 +362,24 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx, loff_t offset) +static int iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos + offset; - loff_t length = iomap_length(iter) - offset; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; - loff_t orig_pos = pos; size_t poff, plen; sector_t sector; + int ret; - if (iomap->type == IOMAP_INLINE) - return iomap_read_inline_data(iter, folio); + if (iomap->type == IOMAP_INLINE) { + ret = iomap_read_inline_data(iter, folio); + if (ret) + return ret; + return iomap_iter_advance(iter, &length); + } /* zero post-eof blocks as the page may be mapped */ ifs = ifs_alloc(iter->inode, folio, iter->flags); @@ -438,25 +438,22 @@ done: * we can skip trailing ones as they will be handled in the next * iteration. 
*/ - return pos - orig_pos + plen; + length = pos - iter->pos + plen; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - struct folio *folio = ctx->cur_folio; - size_t offset = offset_in_folio(folio, iter->pos); - loff_t length = min_t(loff_t, folio_size(folio) - offset, - iomap_length(iter)); - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + int ret; + + while (iomap_length(iter)) { + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) @@ -474,7 +471,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -493,15 +490,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(const struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t length = iomap_length(iter); - loff_t done, ret; + int ret; - for (done = 0; done < length; done += ret) { + while (iomap_length(iter)) { if (ctx->cur_folio && - offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + offset_in_folio(ctx->cur_folio, iter->pos) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; @@ -510,12 +506,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } /** @@ -547,7 +543,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -603,6 +599,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -907,12 +905,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? 
BDP_ASYNC : 0; @@ -923,7 +919,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos = iter->pos; bytes = iov_iter_count(i); retry: @@ -934,8 +931,8 @@ retry: if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. @@ -1006,17 +1003,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 0 : status; } ssize_t @@ -1034,9 +1026,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1270,23 +1264,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; @@ -1304,14 +1297,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return status; } int @@ -1331,7 +1324,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1350,19 +1343,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = 
iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1383,25 +1375,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1424,7 +1417,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1443,17 +1436,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); + } else { + status = iomap_iter_advance_full(&iter); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1456,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1469,11 +1464,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1490,14 +1486,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return length; + return iomap_iter_advance(iter, &length); } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; @@ -1509,7 +1507,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; @@ -1538,16 +1536,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,116 +1563,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. 
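The buffered-io conversions above all share the same new iteration contract: each helper returns an errno-style status and advances the iterator itself, instead of reporting a byte count through iter.processed. A minimal sketch of that shape (example_op_iter() is hypothetical; the count type follows the u64 used by the call sites above):

        static int example_op_iter(struct iomap_iter *iter)
        {
                u64 bytes = iomap_length(iter);

                /* ...operate on [iter->pos, iter->pos + bytes)... */

                return iomap_iter_advance(iter, &bytes);        /* 0 or -errno */
        }

        /* caller side */
        while ((ret = iomap_iter(&iter, ops)) > 0)
                iter.status = example_op_iter(&iter);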
- */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_F_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1591,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. 
*/ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) { + error = wpc->ops->submit_ioend(wpc, error); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; @@ -1709,9 +1610,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1719,36 +1620,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) - return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* @@ -1779,14 +1668,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) @@ -2062,11 +1960,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } 
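A sketch of the filesystem side of the new ->submit_ioend() method shown above: when called without an error it becomes responsible for submitting the bio itself, and any error it returns makes iomap end the bio with that status. "examplefs" and its transaction helper are made up.

        static int examplefs_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
        {
                if (!status)
                        status = examplefs_reserve_ioend_transaction(wpc->ioend); /* hypothetical */
                if (status)
                        return status;  /* iomap ends the bio with this error */

                submit_bio(&wpc->ioend->io_bio);
                return 0;               /* submission is now this method's job */
        }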
EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 0e47da82b0c2..844261a31156 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,6 +12,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. 
- */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -239,6 +244,47 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { @@ -266,81 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, } /* - * Figure out the bio's operation flags from the dio request, the - * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_THROUGH flag in the dio request. 
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O + * that doesn't require any metadata updates (including after I/O completion + * such as unwritten extent conversion) and the underlying device either + * doesn't have a volatile write cache or supports FUA. + * This allows us to avoid cache flushes on I/O completion. */ -static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) +static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, + struct iomap_dio *dio) { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - - if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; - - opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) - opflags |= REQ_ATOMIC; - - return opflags; + if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) + return false; + if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) + return false; + return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); } -static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; - blk_opf_t bio_opf; + blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; struct bio *bio; bool need_zeroout = false; - bool use_fua = false; int nr_pages, ret = 0; - size_t copied = 0; + u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) - return -EINVAL; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } + if (dio->flags & IOMAP_DIO_WRITE) { + bio_opf |= REQ_OP_WRITE; + + if (iomap->flags & IOMAP_F_ATOMIC_BIO) { + /* + * Ensure that the mapping covers the full write + * length, otherwise it won't be submitted as a single + * bio, which is required to use hardware atomics. + */ + if (length != iter->len) + return -EINVAL; + bio_opf |= REQ_ATOMIC; + } - if (iomap->flags & IOMAP_F_SHARED) - dio->flags |= IOMAP_DIO_COW; + if (iomap->type == IOMAP_UNWRITTEN) { + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + } + + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + + if (iomap->flags & IOMAP_F_NEW) { + need_zeroout = true; + } else if (iomap->type == IOMAP_MAPPED) { + if (iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } - if (iomap->flags & IOMAP_F_NEW) { - need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this is a pure - * data IO that doesn't require any metadata updates (including - * after IO completion such as unwritten extent conversion) and - * the underlying device either supports FUA or doesn't have - * a volatile write cache. This allows us to avoid cache flushes - * on IO completion. If we can't use writethrough and need to - * sync, disable in-task completions as dio completion will - * need to call generic_write_sync() which will do a blocking - * fsync / cache flush call. 
+ * We can only do deferred completion for pure overwrites that + * don't require additional I/O at completion time. + * + * This rules out writes that need zeroing or extent conversion, + * extend the file size, or issue metadata I/O or cache flushes + * during completion processing. */ - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_THROUGH) && - (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) - use_fua = true; - else if (dio->flags & IOMAP_DIO_NEED_SYNC) + if (need_zeroout || (pos >= i_size_read(inode)) || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && + !(bio_opf & REQ_FUA))) dio->flags &= ~IOMAP_DIO_CALLER_COMP; + } else { + bio_opf |= REQ_OP_READ; } /* @@ -355,18 +405,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only do deferred completion for pure overwrites that - * don't require additional IO at completion. This rules out - * writes that need zeroing or extent conversion, extend - * the file size, or issue journal IO or cache flushes - * during completion processing. - */ - if (need_zeroout || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; - - /* * The rules for polled IO completions follow the guidelines as the * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. @@ -383,8 +421,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; @@ -416,9 +452,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) { /* - * This bio should have covered the complete length, + * An atomic write bio must cover the complete length, * which it doesn't, so error. We may need to zero out * the tail (complete FS block), similar to when * bio_iov_iter_get_pages() returns an error, above. 
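The REQ_ATOMIC handling above is driven from user space by RWF_ATOMIC direct writes. A hedged user-space sketch of such an untorn write follows; it assumes a kernel, filesystem, device and headers that all support RWF_ATOMIC, and the 4096-byte size/alignment is only an example that must match the limits a real application would query via statx().

/* Illustrative user-space counterpart, not part of this patch. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>
#include <linux/fs.h>	/* RWF_ATOMIC, assuming new enough headers */

int write_atomic_block(const char *path, off_t offset)
{
	struct iovec iov;
	void *buf;
	int fd, ret = -1;

	fd = open(path, O_WRONLY | O_DIRECT);
	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, 4096, 4096))
		goto out_close;
	memset(buf, 0xab, 4096);

	iov.iov_base = buf;
	iov.iov_len = 4096;
	/* Either all 4096 bytes reach the media or none do. */
	if (pwritev2(fd, &iov, 1, offset, RWF_ATOMIC) == 4096)
		ret = 0;

	free(buf);
out_close:
	close(fd);
	return ret;
}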
@@ -465,30 +501,28 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return copied; + return iomap_iter_advance(iter, &copied); return ret; } -static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; - return length; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, - struct iomap_dio *dio) +static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - size_t copied; + u64 copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; @@ -510,11 +544,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, dio->size += copied; if (!copied) return -EFAULT; - return copied; + return iomap_iter_advance(iomi, &copied); } -static loff_t iomap_dio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: @@ -608,9 +641,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -645,6 +675,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -698,7 +731,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. 
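The direct I/O, fiemap, seek and swapfile hunks in this series all convert iteration callbacks from returning a byte count in iter->processed to returning an int status and advancing the iterator explicitly. A minimal sketch of the new calling convention, with hypothetical example_* names:

/*
 * Illustrative only: an iteration callback under the new convention.
 * It stores 0 or a negative errno in iter->status via the loop below and
 * consumes the mapping with iomap_iter_advance() instead of returning a
 * byte count.
 */
static int example_iter(struct iomap_iter *iter, u64 *mapped_bytes)
{
	u64 length = iomap_length(iter);

	if (iter->iomap.type == IOMAP_MAPPED)
		*mapped_bytes += length;

	/* consume the whole mapping; returns 0, or -EIO on overrun */
	return iomap_iter_advance(iter, &length);
}

static int example_loop(struct inode *inode, const struct iomap_ops *ops,
		loff_t pos, u64 len, u64 *mapped_bytes)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_REPORT,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = example_iter(&iter, mapped_bytes);
	return ret;
}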
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) - return iomap_length(iter); + goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; - switch (ret) { - case 0: /* success */ - return iomap_length(iter); - case 1: /* extent array full */ - return 0; - default: /* error */ + if (ret < 0) return ret; - } + if (ret == 1) /* extent array full */ + return 0; + +advance: + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, @@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 000000000000..f6992a3bf66a --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..18894ebba6db --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/list_sort.h> +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. 
This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. 
+ */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,40 +7,25 @@ #include <linux/iomap.h> #include "trace.h" -/* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. - */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } - - /* clear the per iteration state */ - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return ret; +} + +/* + * Advance the current iterator position and output the length remaining for the + * current mapping. 
+ */ +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +{ + if (WARN_ON_ONCE(*count > iomap_length(iter))) + return -EIO; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced; + u64 olen; int ret; - if (iter->iomap.length && ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + /* + * Calculate how far the iter was advanced and the original length bytes + * for ->iomap_end(). + */ + advanced = iter->pos - iter->iter_start_pos; + olen = iter->len + advanced; + + if (ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } - trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); + /* detect old return semantics where this would advance */ + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; + + /* + * Use iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate on error status or if the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. 
+ */ + if (iter->status < 0) + ret = iter->status; + else if (iter->len == 0 || (!advanced && !stale)) + ret = 0; + else + ret = 1; + iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index a845c012b50c..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return length; + return iomap_iter_advance(iter, &length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return length; + return iomap_iter_advance(iter, &length); } } @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); switch (iter->iomap.type) { case IOMAP_HOLE: - return length; + return iomap_iter_advance(iter, &length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return length; + return iomap_iter_advance(iter, &length); return 0; default: *hole_pos = iter->pos; @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index b90d0eda9e51..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
*/ -static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { switch (iomap->type) { @@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_length(iter); + + return iomap_iter_advance_full(iter); } /* @@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..9eab2c8ac3c5 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 2b2938970da3..dd91f725ded6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); +static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); @@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, } -static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, - struct dentry *dentry, umode_t mode) +static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, + struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, ri = jffs2_alloc_raw_inode(); if (!ri) - return -ENOMEM; + return ERR_PTR(-ENOMEM); c = JFFS2_SB_INFO(dir_i->i_sb); @@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (ret) { jffs2_free_raw_inode(ri); - return ret; + return ERR_PTR(ret); } inode = jffs2_new_inode(dir_i, mode, ri); @@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (IS_ERR(inode)) { jffs2_free_raw_inode(ri); jffs2_complete_reservation(c); - return PTR_ERR(inode); + return ERR_CAST(inode); } 
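The jffs2 conversion above and the jfs, kernfs and minix conversions that follow change ->mkdir from returning an int to returning a struct dentry *; the vfs_mkdir() kerneldoc later in this patch describes the same contract. A minimal sketch of a filesystem ->mkdir under the new prototype, where example_new_dir_inode() is hypothetical: return NULL when the passed-in dentry was instantiated, ERR_PTR() on failure, or a different dentry if one was spliced in instead.

/* Illustrative only, not taken from any of the converted filesystems. */
static struct dentry *example_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				    struct dentry *dentry, umode_t mode)
{
	struct inode *inode = example_new_dir_inode(dir, S_IFDIR | mode);

	if (IS_ERR(inode))
		return ERR_CAST(inode);		/* instead of PTR_ERR() */

	inode_inc_link_count(dir);
	d_instantiate(dentry, inode);
	return NULL;				/* success, @dentry used as-is */
}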
inode->i_op = &jffs2_dir_inode_operations; @@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, jffs2_complete_reservation(c); d_instantiate_new(dentry, inode); - return 0; + return NULL; fail: iget_failed(inode); - return ret; + return ERR_PTR(ret); } static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index fc8ede43afde..65a218eba8fa 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, * dentry - dentry of child directory * mode - create mode (rwxrwxrwx). * - * RETURN: Errors from subroutines + * RETURN: ERR_PTR() of errors from subroutines. * * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, - struct dentry *dentry, umode_t mode) +static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, + struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, out1: jfs_info("jfs_mkdir: rc:%d", rc); - return rc; + return ERR_PTR(rc); } /* diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5f0f8b95f44c..d296aad70800 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1230,24 +1230,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) - return -EPERM; + return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) - return -ENODEV; + return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); - return ret; + return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/libfs.c b/fs/libfs.c index 8444f5cc4064..6393d7c49ee6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -496,7 +496,7 @@ offset_dir_lookup(struct dentry *parent, loff_t offset) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); - child = mas_find(&mas, DIR_OFFSET_MAX); + child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } @@ -2113,7 +2113,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry **stashed) +struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; @@ -2215,7 +2215,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. 
*/ - path->dentry = get_stashed_dentry(stashed); + path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 5d9c1406fe27..8938536d8d3c 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode = minix_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode_inc_link_count(dir); minix_set_inode(inode, 0); @@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 7b1df8cc2821..a37991fdb194 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -6,6 +6,7 @@ #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/seq_file.h> #include "internal.h" @@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); + +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) +{ + struct uid_gid_map *map, *map_up; + u32 idx, nr_mappings; + + if (!is_valid_mnt_idmap(idmap)) + return 0; + + /* + * Idmappings are shown relative to the caller's idmapping. + * This is both the most intuitive and most useful solution. + */ + if (uid_map) { + map = &idmap->uid_map; + map_up = ¤t_user_ns()->uid_map; + } else { + map = &idmap->gid_map; + map_up = ¤t_user_ns()->gid_map; + } + + for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { + uid_t lower; + struct uid_gid_extent *extent; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = &map->extent[idx]; + else + extent = &map->forward[idx]; + + /* + * Verify that the whole range of the mapping can be + * resolved in the caller's idmapping. If it cannot be + * resolved skip the mapping. 
+ */ + lower = map_id_range_up(map_up, extent->lower_first, extent->count); + if (lower == (uid_t) -1) + continue; + + seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); + + seq->count++; /* mappings are separated by \0 */ + if (seq_has_overflowed(seq)) + return -EAGAIN; + + nr_mappings++; + } + + return nr_mappings; +} diff --git a/fs/mount.h b/fs/mount.h index ffb613cdfeee..7aecf2a60472 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,12 @@ #include <linux/ns_common.h> #include <linux/fs_pin.h> +extern struct list_head notify_list; + +typedef __u32 __bitwise mntns_flags_t; + +#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) + struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -20,12 +26,18 @@ struct mnt_namespace { wait_queue_head_t poll; struct rcu_head mnt_ns_rcu; }; + u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; +#ifdef CONFIG_FSNOTIFY + __u32 n_fsnotify_mask; + struct fsnotify_mark_connector __rcu *n_fsnotify_marks; +#endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ + mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { @@ -76,6 +88,8 @@ struct mount { #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; + struct list_head to_notify; /* need to queue notification */ + struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ @@ -156,6 +170,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt) return !RB_EMPTY_NODE(&mnt->mnt_node); } +static inline bool mnt_ns_empty(const struct mnt_namespace *ns) +{ + return RB_EMPTY_ROOT(&ns->mounts); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { struct mnt_namespace *ns = mnt->mnt_ns; @@ -177,3 +196,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } + +#ifdef CONFIG_FSNOTIFY +static inline void mnt_notify_add(struct mount *m) +{ + /* Optimize the case where there are no watches */ + if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || + (m->prev_ns && m->prev_ns->n_fsnotify_marks)) + list_add_tail(&m->to_notify, ¬ify_list); + else + m->prev_ns = m->mnt_ns; +} +#else +static inline void mnt_notify_add(struct mount *m) +{ +} +#endif + +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index 82aecf372743..ad7844de87c3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ - if (inode->i_blkbits == PAGE_SHIFT && + if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; @@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 
<< blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; @@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) sector_t last_block_in_file; sector_t first_block; unsigned page_block; - unsigned first_hole = blocks_per_page; + unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; @@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - /* MAX_BUF_PER_PAGE, for example */ - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (folio_buffers(folio)) goto confused; - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + args->nr_pages * blocks_per_page; + block_in_file = folio_pos(folio) >> blkbits; + last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) clear_buffer_mapped(map_bh); break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * Then do more get_blocks calls until we are done with this folio. */ map_bh->b_folio = folio; - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (!buffer_mapped(map_bh)) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; @@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) goto confused; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? 
*/ @@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) bdev = map_bh->b_bdev; } - if (first_hole != blocks_per_page) { - folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); + if (first_hole != blocks_per_folio) { + folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); @@ -303,10 +300,10 @@ alloc_new: relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || - (first_hole != blocks_per_page)) + (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else - args->last_block_in_bio = first_block + blocks_per_page - 1; + args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return args->bio; @@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, - .nr_pages = 1, + .nr_pages = folio_nr_pages(folio), .get_block = get_block, }; @@ -456,12 +453,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; - unsigned first_unmapped = blocks_per_page; + unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; @@ -486,12 +483,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, */ if (buffer_dirty(bh)) goto confused; - if (first_unmapped == blocks_per_page) + if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } - if (first_unmapped != blocks_per_page) + if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) @@ -527,7 +524,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); + block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. 
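The mpage changes above replace PAGE_SIZE-derived block counts with folio-derived ones so that large folios are handled correctly. Two illustrative helpers, not part of the patch, restate the arithmetic being substituted:

static inline sector_t mpage_first_block(struct folio *folio, unsigned blkbits)
{
	/* byte offset of the folio in the file, expressed in fs blocks */
	return folio_pos(folio) >> blkbits;
}

static inline unsigned int mpage_blocks_per_folio(struct folio *folio,
						  unsigned blkbits)
{
	/* e.g. a 64KiB folio with 4KiB blocks holds 16 blocks */
	return folio_size(folio) >> blkbits;
}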
@@ -536,7 +533,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; - for (page_block = 0; page_block < blocks_per_page; ) { + for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; @@ -618,14 +615,14 @@ alloc_new: BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); - if (boundary || (first_unmapped != blocks_per_page)) { + if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { - mpd->last_block_in_bio = first_block + blocks_per_page - 1; + mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; diff --git a/fs/namei.c b/fs/namei.c index ecb7b95c2ca3..360a86ca1f02 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,6 +125,13 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) +static inline void initname(struct filename *name) +{ + name->uptr = NULL; + name->aname = NULL; + atomic_set(&name->refcnt, 1); +} + struct filename * getname_flags(const char __user *filename, int flags) { @@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags) return ERR_PTR(-ENAMETOOLONG); } } - - atomic_set(&result->refcnt, 1); - result->uptr = filename; - result->aname = NULL; + initname(result); audit_getname(result); return result; } @@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags) return getname_flags(filename, flags); } -struct filename *getname(const char __user * filename) -{ - return getname_flags(filename, 0); -} - struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; @@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); - result->uptr = NULL; - result->aname = NULL; - atomic_set(&result->refcnt, 1); + initname(result); audit_getname(result); - return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { + int refcnt; + if (IS_ERR_OR_NULL(name)) return; - if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) - return; + refcnt = atomic_read(&name->refcnt); + if (refcnt != 1) { + if (WARN_ON_ONCE(!refcnt)) + return; - if (!atomic_dec_and_test(&name->refcnt)) - return; + if (!atomic_dec_and_test(&name->refcnt)) + return; + } if (name->name != name->iname) { __putname(name->name); @@ -1670,6 +1671,8 @@ static struct dentry *lookup_dcache(const struct qstr *name, * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, @@ -1680,7 +1683,7 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct inode *dir = base->d_inode; if (dentry) - return dentry; + goto found; /* Don't create child dentry for a dead directory. 
*/ if (unlikely(IS_DEADDIR(dir))) @@ -1695,6 +1698,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, dput(dentry); dentry = old; } +found: + if (IS_ERR(dentry)) + return dentry; + if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); @@ -2863,15 +2877,14 @@ static int lookup_one_common(struct mnt_idmap *idmap, * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * - * The caller must hold base->i_mutex. + * No locks need be held - only a counted reference to @base is needed. + * */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -3415,6 +3428,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; + default: + VFS_BUG_ON_INODE(1, inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); @@ -3995,7 +4010,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4078,27 +4093,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) - create_flags = 0; + create_flags &= ~LOOKUP_CREATE; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; - error = -EEXIST; - if (d_is_positive(dentry)) - goto fail; - - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (unlikely(!create_flags)) { - error = -ENOENT; - goto fail; - } if (unlikely(err2)) { error = err2; goto fail; @@ -4129,7 +4130,8 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { - dput(dentry); + if (!IS_ERR(dentry)) + dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); @@ -4275,7 +4277,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d } /** - * vfs_mkdir - create directory + * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory @@ -4288,32 +4290,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. + * + * In the event that the filesystem does not use the *@dentry but leaves it + * negative or unhashes it and possibly splices a different one returning it, + * the original dentry is dput() and the alternate is returned. + * + * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
*/ -int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; + struct dentry *de; error = may_create(idmap, dir, dentry); if (error) - return error; + goto err; + error = -EPERM; if (!dir->i_op->mkdir) - return -EPERM; + goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) - return error; + goto err; + error = -EMLINK; if (max_links && dir->i_nlink >= max_links) - return -EMLINK; + goto err; - error = dir->i_op->mkdir(idmap, dir, dentry, mode); - if (!error) - fsnotify_mkdir(dir, dentry); - return error; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); + error = PTR_ERR(de); + if (IS_ERR(de)) + goto err; + if (de) { + dput(dentry); + dentry = de; + } + fsnotify_mkdir(dir, dentry); + return dentry; + +err: + dput(dentry); + return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -4333,8 +4354,10 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4445,10 +4468,6 @@ retry: error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; - if (!dentry->d_inode) { - error = -ENOENT; - goto exit4; - } error = security_path_rmdir(&path, dentry); if (error) goto exit4; @@ -4579,7 +4598,7 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (last.name[last.len] || d_is_negative(dentry)) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; ihold(inode); @@ -4613,9 +4632,7 @@ exit1: return error; slashes: - if (d_is_negative(dentry)) - error = -ENOENT; - else if (d_is_dir(dentry)) + if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; @@ -5115,7 +5132,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; + unsigned int lookup_flags = 0, target_flags = + LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; int error = -EINVAL; @@ -5128,6 +5146,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, if (flags & RENAME_EXCHANGE) target_flags = 0; + if (flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, @@ -5169,23 +5189,12 @@ retry_deleg: error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; - /* source must exist */ - error = -ENOENT; - if (d_is_negative(old_dentry)) - goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; - error = -EEXIST; - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) - goto exit5; if (flags & RENAME_EXCHANGE) { - error = -ENOENT; - if (d_is_negative(new_dentry)) - goto exit5; - if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) diff --git a/fs/namespace.c b/fs/namespace.c index 8f1000f9f3df..6100e5b962a6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -81,15 
+81,23 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); +#ifdef CONFIG_FSNOTIFY +LIST_HEAD(notify_list); /* protected by namespace_sem */ +#endif static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ +enum mount_kattr_flags_t { + MOUNT_KATTR_RECURSE = (1 << 0), + MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), +}; + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; - bool recurse; + enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; @@ -163,6 +171,7 @@ static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { + fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -998,6 +1007,17 @@ static inline int check_mnt(struct mount *mnt) return mnt->mnt_ns == current->nsproxy->mnt_ns; } +static inline bool check_anonymous_mnt(struct mount *mnt) +{ + u64 seq; + + if (!is_anon_ns(mnt->mnt_ns)) + return false; + + seq = mnt->mnt_ns->seq_origin; + return !seq || (seq == current->nsproxy->mnt_ns->seq); +} + /* * vfsmount lock must be held for write */ @@ -1176,6 +1196,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + + mnt_notify_add(mnt); } /* @@ -1723,6 +1745,50 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); +#ifdef CONFIG_FSNOTIFY +static void mnt_notify(struct mount *p) +{ + if (!p->prev_ns && p->mnt_ns) { + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } else if (p->prev_ns && !p->mnt_ns) { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + } else if (p->prev_ns == p->mnt_ns) { + fsnotify_mnt_move(p->mnt_ns, &p->mnt); + } else { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } + p->prev_ns = p->mnt_ns; +} + +static void notify_mnt_list(void) +{ + struct mount *m, *tmp; + /* + * Notify about mounts that were added/reparented/detached/remain + * connected after unmount. + */ + list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { + mnt_notify(m); + list_del_init(&m->to_notify); + } +} + +static bool need_notify_mnt_list(void) +{ + return !list_empty(¬ify_list); +} +#else +static void notify_mnt_list(void) +{ +} + +static bool need_notify_mnt_list(void) +{ + return false; +} +#endif + static void namespace_unlock(void) { struct hlist_head head; @@ -1733,7 +1799,18 @@ static void namespace_unlock(void) hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); - up_write(&namespace_sem); + if (need_notify_mnt_list()) { + /* + * No point blocking out concurrent readers while notifications + * are sent. This will also allow statmount()/listmount() to run + * concurrently. 
+ */ + downgrade_write(&namespace_sem); + notify_mnt_list(); + up_read(&namespace_sem); + } else { + up_write(&namespace_sem); + } shrink_dentry_list(&list); @@ -1846,6 +1923,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); + + /* + * At this point p->mnt_ns is NULL, notification will be queued + * only if + * + * - p->prev_ns is non-NULL *and* + * - p->prev_ns->n_fsnotify_marks is non-NULL + * + * This will preclude queuing the mount if this is a cleanup + * after a failed copy_tree() or destruction of an anonymous + * namespace, etc. + */ + mnt_notify_add(p); } } @@ -2026,6 +2116,7 @@ static void warn_mandlock(void) static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); + struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; @@ -2035,7 +2126,7 @@ static int can_umount(const struct path *path, int flags) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; - if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -2145,16 +2236,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr } } +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) +{ + if (!is_mnt_ns_file(dentry)) + return NULL; + + return to_mnt_ns(get_proc_ns(dentry->d_inode)); +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ - struct mnt_namespace *mnt_ns; - if (!is_mnt_ns_file(dentry)) + struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); + + if (!mnt_ns) return false; - mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -2246,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path) static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +{ + /* + * This mount belonged to an anonymous mount namespace + * but was moved to a non-anonymous mount namespace and + * then unmounted. + */ + if (unlikely(!mnt_ns)) + return false; + + /* + * This mount belongs to a non-anonymous mount namespace + * and we know that such a mount can never transition to + * an anonymous mount namespace again. + */ + if (!is_anon_ns(mnt_ns)) { + /* + * A detached mount either belongs to an anonymous mount + * namespace or a non-anonymous mount namespace. It + * should never belong to something purely internal. + */ + VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); + return false; + } + + return true; +} + void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; - namespace_lock(); - lock_mount_hash(); - ns = real_mount(mnt)->mnt_ns; - if (ns) { - if (is_anon_ns(ns)) - umount_tree(real_mount(mnt), UMOUNT_CONNECTED); - else - ns = NULL; + struct mount *m = real_mount(mnt); + + scoped_guard(rcu) { + if (!must_dissolve(READ_ONCE(m->mnt_ns))) + return; } - unlock_mount_hash(); - namespace_unlock(); - if (ns) - free_mnt_ns(ns); + + scoped_guard(rwsem_write, &namespace_sem) { + ns = m->mnt_ns; + if (!must_dissolve(ns)) + return; + + /* + * After must_dissolve() we know that this is a detached + * mount in an anonymous mount namespace. 
+ * + * Now when mnt_has_parent() reports that this mount + * tree has a parent, we know that this anonymous mount + * tree has been moved to another anonymous mount + * namespace. + * + * So when closing this file we cannot unmount the mount + * tree. This will be done when the file referring to + * the root of the anonymous mount namespace will be + * closed (It could already be closed but it would sync + * on @namespace_sem and wait for us to finish.). + */ + if (mnt_has_parent(m)) + return; + + lock_mount_hash(); + umount_tree(m, UMOUNT_CONNECTED); + unlock_mount_hash(); + } + + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) @@ -2287,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) +{ + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + /** * clone_private_mount - create a private clone of a path * @path: path to clone @@ -2295,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * + * This assumes caller has called or done the equivalent of may_mount(). + * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) @@ -2302,30 +2478,36 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - down_read(&namespace_sem); + scoped_guard(rwsem_read, &namespace_sem) if (IS_MNT_UNBINDABLE(old_mnt)) - goto invalid; + return ERR_PTR(-EINVAL); - if (!check_mnt(old_mnt)) - goto invalid; + if (mnt_has_parent(old_mnt)) { + if (!check_mnt(old_mnt)) + return ERR_PTR(-EINVAL); + } else { + if (!is_mounted(&old_mnt->mnt)) + return ERR_PTR(-EINVAL); + + /* Make sure this isn't something purely kernel internal. */ + if (!is_anon_ns(old_mnt->mnt_ns)) + return ERR_PTR(-EINVAL); + + /* Make sure we don't create mount namespace loops. 
*/ + if (!check_for_nsfs_mounts(old_mnt)) + return ERR_PTR(-EINVAL); + } if (has_locked_children(old_mnt, path->dentry)) - goto invalid; + return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); - if (IS_ERR(new_mnt)) - return ERR_CAST(new_mnt); + return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; - return &new_mnt->mnt; - -invalid: - up_read(&namespace_sem); - return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -2424,6 +2606,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) enum mnt_tree_flags_t { MNT_TREE_MOVE = BIT(0), MNT_TREE_BENEATH = BIT(1), + MNT_TREE_PROPAGATION = BIT(2), }; /** @@ -2547,6 +2730,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, dest_mp = smp; unhash_mnt(source_mnt); attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + mnt_notify_add(source_mnt); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { @@ -2773,6 +2957,71 @@ static int do_change_type(struct path *path, int ms_flags) return err; } +/* may_copy_tree() - check if a mount tree can be copied + * @path: path to the mount tree to be copied + * + * This helper checks if the caller may copy the mount tree starting + * from @path->mnt. The caller may copy the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller tries to copy an nfs mount referring to a mount + * namespace, i.e., the caller is trying to copy a mount namespace + * entry from nsfs. + * (3) The caller tries to copy a pidfs mount referring to a pidfd. + * (4) The caller is trying to copy a mount tree that belongs to an + * anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * This is not strictly necessary. The current semantics of the new + * mount api enforce that the caller must be located in the same + * mount namespace as the mount tree it interacts with. Using the + * origin sequence number preserves these semantics even for + * anonymous mount namespaces. However, one could envision extending + * the api to directly operate across mount namespace if needed. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to copy the mount tree. + * + * Returns true if the mount tree can be copied, false otherwise. 
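Seen from userspace, the anonymous-namespace case described above is a detached tree created with OPEN_TREE_CLONE being copied a second time. A hedged sketch with the new mount API; the paths are made up and the SYS_open_tree definition is assumed to come from the libc headers:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int main(void)
	{
		/* Detach a recursive private copy of a source tree (path illustrative)... */
		int t1 = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
				 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
		/* ...then copy the detached, anonymous-namespace tree once more. */
		int t2 = syscall(SYS_open_tree, t1, "",
				 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
		return (t1 < 0 || t2 < 0) ? 1 : 0;
	}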
+ */ +static inline bool may_copy_tree(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + const struct dentry_operations *d_op; + + if (check_mnt(mnt)) + return true; + + d_op = path->dentry->d_op; + if (d_op == &ns_dentry_operations) + return true; + + if (d_op == &pidfs_dentry_operations) + return true; + + if (!is_mounted(path->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + + static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2780,13 +3029,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old)) { - const struct dentry_operations *d_op = old_path->dentry->d_op; - - if (d_op != &ns_dentry_operations && - d_op != &pidfs_dentry_operations) - return mnt; - } + if (!may_copy_tree(old_path)) + return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; @@ -2853,15 +3097,30 @@ out: static struct file *open_detached_copy(struct path *path, bool recursive) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; + struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; struct file *file; + ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); + + /* + * Record the sequence number of the source mount namespace. + * This needs to hold namespace_sem to ensure that the mount + * doesn't get attached. + */ + if (is_mounted(path->mnt)) { + src_mnt_ns = real_mount(path->mnt)->mnt_ns; + if (is_anon_ns(src_mnt_ns)) + ns->seq_origin = src_mnt_ns->seq_origin; + else + ns->seq_origin = src_mnt_ns->seq; + } + mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); @@ -2889,24 +3148,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } -SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { - struct file *file; - struct path path; + int ret; + struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; - int error; - int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) - return -EINVAL; + return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) - return -EINVAL; + return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; @@ -2916,27 +3173,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) - return -EPERM; + return ERR_PTR(-EPERM); + + ret = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (detached) + return open_detached_copy(&path, flags & AT_RECURSIVE); + + return dentry_open(&path, O_PATH, current_cred()); +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +{ + int fd; + struct file *file __free(fput) = NULL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if 
(fd < 0) return fd; - error = user_path_at(dfd, filename, lookup_flags, &path); - if (unlikely(error)) { - file = ERR_PTR(error); - } else { - if (detached) - file = open_detached_copy(&path, flags & AT_RECURSIVE); - else - file = dentry_open(&path, O_PATH, current_cred()); - path_put(&path); - } - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - fd_install(fd, file); + fd_install(fd, no_free_ptr(file)); return fd; } @@ -3123,28 +3385,6 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -/* - * Check that there aren't references to earlier/same mount namespaces in the - * specified subtree. Such references can act as pins for mount namespaces - * that aren't checked by the mount-cycle checking code, thereby allowing - * cycles to be made. - */ -static bool check_for_nsfs_mounts(struct mount *subtree) -{ - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) - if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; -} - static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; @@ -3320,8 +3560,56 @@ static int can_move_mount_beneath(const struct path *from, return 0; } -static int do_move_mount(struct path *old_path, struct path *new_path, - bool beneath) +/* may_use_mount() - check if a mount tree can be used + * @mnt: vfsmount to be used + * + * This helper checks if the caller may use the mount tree starting + * from @path->mnt. The caller may use the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller is trying to use a mount tree that belongs to an + * anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to use the mount tree. + * + * Returns true if the mount tree can be used, false otherwise. + */ +static inline bool may_use_mount(struct mount *mnt) +{ + if (check_mnt(mnt)) + return true; + + /* + * Make sure that noone unmounted the target path or somehow + * managed to get their hands on something purely kernel + * internal. 
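may_use_mount() is what a later move_mount() of such a detached tree has to pass; the typical userspace sequence that exercises it, again with assumed paths and relying on the libc syscall-number definitions:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int main(void)
	{
		int tree = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
				   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);

		/* Attach the detached tree at an illustrative target mountpoint. */
		return syscall(SYS_move_mount, tree, "", AT_FDCWD, "/mnt/dst",
			       MOVE_MOUNT_F_EMPTY_PATH) ? 1 : 0;
	}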
+ */ + if (!is_mounted(&mnt->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + +static int do_move_mount(struct path *old_path, + struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; @@ -3329,8 +3617,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, struct mount *parent; struct mountpoint *mp, *old_mp; int err; - bool attached; - enum mnt_tree_flags_t flags = 0; + bool attached, beneath = flags & MNT_TREE_BENEATH; mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) @@ -3346,8 +3633,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, ns = old->mnt_ns; err = -EINVAL; - /* The mountpoint must be in our namespace. */ - if (!check_mnt(p)) + if (!may_use_mount(p)) goto out; /* The thing moved must be mounted... */ @@ -3358,6 +3644,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; + if (is_anon_ns(ns)) { + /* + * Ending up with two files referring to the root of the + * same anonymous mount namespace would cause an error + * as this would mean trying to move the same mount + * twice into the mount tree which would be rejected + * later. But be explicit about it right here. + */ + if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) + goto out; + + /* + * If this is an anonymous mount tree ensure that mount + * propagation can detect mounts that were just + * propagated to the target mount tree so we don't + * propagate onto them. + */ + ns->mntns_flags |= MNTNS_PROPAGATING; + } else if (is_anon_ns(p->mnt_ns)) { + /* + * Don't allow moving an attached mount tree to an + * anonymous mount tree. + */ + goto out; + } + if (old->mnt.mnt_flags & MNT_LOCKED) goto out; @@ -3400,6 +3712,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (err) goto out; + if (is_anon_ns(ns)) + ns->mntns_flags &= ~MNTNS_PROPAGATING; + /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); @@ -3408,10 +3723,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path, out: unlock_mount(mp); if (!err) { - if (attached) + if (attached) { mntput_no_expire(parent); - else + } else { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); + } } return err; } @@ -3428,7 +3746,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, false); + err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } @@ -4269,6 +4587,21 @@ err_unlock: return ret; } +static inline int vfs_move_mount(struct path *from_path, struct path *to_path, + enum mnt_tree_flags_t mflags) +{ + int ret; + + ret = security_move_mount(from_path, to_path); + if (ret) + return ret; + + if (mflags & MNT_TREE_PROPAGATION) + return do_set_group(from_path, to_path); + + return do_move_mount(from_path, to_path, mflags); +} + /* * Move a mount from one place to another. 
In combination with * fsopen()/fsmount() this is used to install a new mount and in combination @@ -4282,8 +4615,12 @@ SYSCALL_DEFINE5(move_mount, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { - struct path from_path, to_path; - unsigned int lflags; + struct path to_path __free(path_put) = {}; + struct path from_path __free(path_put) = {}; + struct filename *to_name __free(putname) = NULL; + struct filename *from_name __free(putname) = NULL; + unsigned int lflags, uflags; + enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) @@ -4296,43 +4633,53 @@ SYSCALL_DEFINE5(move_mount, (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; - /* If someone gives a pathname, they aren't permitted to move - * from an fd that requires unmount as we can't get at the flag - * to clear it afterwards. - */ + if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; + if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; + lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; - - ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); - if (ret < 0) - return ret; + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + uflags = 0; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; + to_name = getname_maybe_null(to_pathname, uflags); + if (IS_ERR(to_name)) + return PTR_ERR(to_name); + + if (!to_name && to_dfd >= 0) { + CLASS(fd_raw, f_to)(to_dfd); + if (fd_empty(f_to)) + return -EBADF; + + to_path = fd_file(f_to)->f_path; + path_get(&to_path); + } else { + ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); + if (ret) + return ret; + } - ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); - if (ret < 0) - goto out_from; + if (!from_name && from_dfd >= 0) { + CLASS(fd_raw, f_from)(from_dfd); + if (fd_empty(f_from)) + return -EBADF; - ret = security_move_mount(&from_path, &to_path); - if (ret < 0) - goto out_to; + return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); + } - if (flags & MOVE_MOUNT_SET_GROUP) - ret = do_set_group(&from_path, &to_path); - else - ret = do_move_mount(&from_path, &to_path, - (flags & MOVE_MOUNT_BENEATH)); + ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); + if (ret) + return ret; -out_to: - path_put(&to_path); -out_from: - path_put(&from_path); - return ret; + return vfs_move_mount(&from_path, &to_path, mflags); } /* @@ -4468,6 +4815,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); + mnt_notify_add(root_mnt); + mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: @@ -4512,11 +4861,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return -EINVAL; /* - * Once a mount has been idmapped we don't allow it to change its - * mapping. It makes things simpler and callers can just create - * another bind-mount they can idmap if they want to. + * We only allow an mount to change it's idmapping if it has + * never been accessible to userspace. 
*/ - if (is_idmapped_mnt(m)) + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ @@ -4576,7 +4924,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) break; } - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } @@ -4606,18 +4954,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { + struct mnt_idmap *old_idmap; + if (!kattr->mnt_idmap) return; - /* - * Pairs with smp_load_acquire() in mnt_idmap(). - * - * Since we only allow a mount to change the idmapping once and - * verified this in can_idmap_mount() we know that the mount has - * @nop_mnt_idmap attached to it. So there's no need to drop any - * references. - */ + old_idmap = mnt_idmap(&mnt->mnt); + + /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); + mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4637,7 +4983,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); @@ -4667,7 +5013,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) */ namespace_lock(); if (kattr->propagation == MS_SHARED) { - err = invent_group_ids(mnt, kattr->recurse); + err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; @@ -4718,7 +5064,7 @@ out: } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; @@ -4726,13 +5072,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; - /* - * We currently do not support clearing an idmapped mount. If this ever - * is a use-case we can revisit this but for now let's keep it simple - * and not allow it. - */ - if (attr->attr_clr & MOUNT_ATTR_IDMAP) - return -EINVAL; + if (attr->attr_clr & MOUNT_ATTR_IDMAP) { + /* + * We can only remove an idmapping if it's never been + * exposed to userspace. + */ + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) + return -EINVAL; + + /* + * Removal of idmappings is equivalent to setting + * nop_mnt_idmap. 
+ */ + if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { + kattr->mnt_idmap = &nop_mnt_idmap; + return 0; + } + } if (attr->userns_fd > INT_MAX) return -EINVAL; @@ -4769,22 +5125,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { - unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - - *kattr = (struct mount_kattr) { - .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), - }; - if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) @@ -4832,35 +5174,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, return -EINVAL; } - return build_mount_idmapped(attr, usize, kattr, flags); + return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { - put_user_ns(kattr->mnt_userns); - kattr->mnt_userns = NULL; + if (kattr->mnt_userns) { + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; + } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } -SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, - unsigned int, flags, struct mount_attr __user *, uattr, - size_t, usize) +static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { - int err; - struct path target; + int ret; struct mount_attr attr; - struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); - if (flags & ~(AT_EMPTY_PATH | - AT_RECURSIVE | - AT_SYMLINK_NOFOLLOW | - AT_NO_AUTOMOUNT)) - return -EINVAL; - if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) @@ -4869,9 +5204,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (!may_mount()) return -EPERM; - err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); - if (err) - return err; + ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (ret) + return ret; /* Don't bother walking through the mounts if this is a nop. 
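The MOUNT_ATTR_IDMAP handling above (replacement, and removal via attr_clr mapping to nop_mnt_idmap) is reachable through the new open_tree_attr() syscall added later in this file, which always sets MOUNT_KATTR_IDMAP_REPLACE. A hedged userspace sketch of idmapping a freshly detached tree in one call; the __NR_open_tree_attr constant and the pid/path used are assumptions, not taken from this diff:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	int main(void)
	{
		/* A user namespace fd providing the mapping (pid is illustrative). */
		int userns_fd = open("/proc/1234/ns/user", O_RDONLY | O_CLOEXEC);
		struct mount_attr attr = {
			.attr_set  = MOUNT_ATTR_IDMAP,
			.userns_fd = userns_fd,
		};
		/* Assumes __NR_open_tree_attr is provided by the kernel headers. */
		int tree = syscall(__NR_open_tree_attr, AT_FDCWD, "/mnt/src",
				   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC,
				   &attr, sizeof(attr));

		/* tree now refers to a detached, idmapped copy of the source tree. */
		return tree < 0 ? 1 : 0;
	}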
*/ if (attr.attr_set == 0 && @@ -4879,7 +5214,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, attr.propagation == 0) return 0; - err = build_mount_kattr(&attr, usize, &kattr, flags); + return build_mount_kattr(&attr, usize, kattr); +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_kattr kattr; + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + }; + + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + err = copy_mount_setattr(uattr, usize, &kattr); if (err) return err; @@ -4892,6 +5259,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, + unsigned, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + struct file __free(fput) *file = NULL; + int fd; + + if (!uattr && usize) + return -EINVAL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (uattr) { + int ret; + struct mount_kattr kattr = {}; + + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + ret = copy_mount_setattr(uattr, usize, &kattr); + if (ret) + return ret; + + ret = do_mount_setattr(&file->f_path, &kattr); + if (ret) + return ret; + + finish_mount_kattr(&kattr); + } + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + fd_install(fd, no_free_ptr(file)); + return fd; +} + int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) @@ -4915,6 +5323,7 @@ struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; + struct mnt_idmap *idmap; u64 mask; struct path root; struct statmount sm; @@ -5184,6 +5593,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, true); + if (ret < 0) + return ret; + + s->sm.mnt_uidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_UIDMAP; + return 0; +} + +static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, false); + if (ret < 0) + return ret; + + s->sm.mnt_gidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. 
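On the consuming side, userspace requests these new records with statmount(). A sketch of reading the uid mappings of a mount: the mnt_uidmap/mnt_uidmap_num fields and the STATMOUNT_MNT_UIDMAP bit come from this patch, while the buffer size, the NUL-separated layout of the entries and the availability of SYS_statmount in the libc headers are assumptions. The supported_mask reporting added alongside (STATMOUNT_SUPPORTED_MASK) lets callers detect whether the running kernel knows about these bits at all.

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/mount.h>

	/* Illustrative helper: print the uid mappings of the mount with @mnt_id. */
	void print_uidmap(__u64 mnt_id)
	{
		char buf[4096] __attribute__((aligned(8)));
		struct statmount *sm = (struct statmount *)buf;
		struct mnt_id_req req = {
			.size	= MNT_ID_REQ_SIZE_VER0,
			.mnt_id	= mnt_id,
			.param	= STATMOUNT_MNT_UIDMAP,
		};

		if (syscall(SYS_statmount, &req, sm, sizeof(buf), 0) < 0)
			return;
		if (!(sm->mask & STATMOUNT_MNT_UIDMAP))
			return;	/* not an idmapped mount */

		const char *s = sm->str + sm->mnt_uidmap;
		for (__u32 i = 0; i < sm->mnt_uidmap_num; i++) {
			printf("uid map: %s\n", s);
			s += strlen(s) + 1;	/* entries assumed to be NUL-separated */
		}
	}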
+ */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_GIDMAP; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5231,6 +5680,14 @@ static int statmount_string(struct kstatmount *s, u64 flag) offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; + case STATMOUNT_MNT_UIDMAP: + sm->mnt_uidmap = start; + ret = statmount_mnt_uidmap(s, seq); + break; + case STATMOUNT_MNT_GIDMAP: + sm->mnt_gidmap = start; + ret = statmount_mnt_gidmap(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5306,7 +5763,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ - if (RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; @@ -5323,6 +5780,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) return 0; } +/* This must be updated whenever a new flag is added */ +#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ + STATMOUNT_MNT_BASIC | \ + STATMOUNT_PROPAGATE_FROM | \ + STATMOUNT_MNT_ROOT | \ + STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | \ + STATMOUNT_MNT_NS_ID | \ + STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | \ + STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | \ + STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_SUPPORTED_MASK) + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5331,7 +5803,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, int err; /* Has the namespace already been emptied? */ - if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); @@ -5356,6 +5828,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); @@ -5389,12 +5862,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_UIDMAP) + err = statmount_string(s, STATMOUNT_MNT_UIDMAP); + + if (!err && s->mask & STATMOUNT_MNT_GIDMAP) + err = statmount_string(s, STATMOUNT_MNT_GIDMAP); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); + if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { + s->sm.mask |= STATMOUNT_SUPPORTED_MASK; + s->sm.supported_mask = STATMOUNT_SUPPORTED; + } + if (err) return err; + /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? 
*/ + WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); + return 0; } @@ -5412,7 +5899,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 0bf3c2f5a710..5e3f0aeb51f3 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -125,9 +125,9 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) * Perform a read to an application buffer, bypassing the pagecache and the * local disk cache. */ -static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) +static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) { - int ret; + ssize_t ret; _enter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); @@ -155,7 +155,7 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) else ret = -EIOCBQUEUED; out: - _leave(" = %d", ret); + _leave(" = %zd", ret); return ret; } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 636cc5a98ef5..23c75755ad4e 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -682,14 +682,16 @@ void netfs_wait_for_pause(struct netfs_io_request *rreq) trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + __set_current_state(TASK_RUNNING); + netfs_read_collection(rreq); + continue; + } } if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c index 75d97af14b4a..207b6a326651 100644 --- a/fs/netfs/rolling_buffer.c +++ b/fs/netfs/rolling_buffer.c @@ -146,10 +146,6 @@ ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, /* Store the counter after setting the slot. */ smp_store_release(&roll->next_head_slot, to); - - for (; ix < folioq_nr_slots(fq); ix++) - folioq_clear(fq, ix); - return size; } diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 294f67795f79..3fca59e6475d 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -400,7 +400,8 @@ void netfs_write_collection_worker(struct work_struct *work) trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); if (wreq->io_streams[1].active && - wreq->io_streams[1].failed) { + wreq->io_streams[1].failed && + ictx->ops->invalidate_cache) { /* Cache write failure doesn't prevent writeback completion * unless we're in disconnected mode. 
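One note before the fs/nfs and fs/nfsd hunks that follow: they are adaptations to the vfs_mkdir()/->mkdir() signature change made in fs/namei.c earlier in this diff, where mkdir now returns the (possibly replaced) dentry or an ERR_PTR and consumes the passed-in dentry on failure. For a caller the new contract looks roughly like this sketch; the function name and mode are illustrative:

	#include <linux/fs.h>
	#include <linux/dcache.h>
	#include <linux/err.h>

	/* Illustrative caller; @idmap, @dir and @dentry as for vfs_mkdir(). */
	static int example_make_subdir(struct mnt_idmap *idmap, struct inode *dir,
				       struct dentry *dentry)
	{
		struct dentry *de = vfs_mkdir(idmap, dir, dentry, 0755);

		if (IS_ERR(de))
			return PTR_ERR(de);	/* vfs_mkdir() already dropped @dentry */
		/* On success, use @de: it may differ from the dentry passed in. */
		dput(de);
		return 0;
	}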
*/ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2b04038b0e40..bc957487f6ec 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1532,7 +1532,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; - return flags & LOOKUP_EXCL; + return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) == + (LOOKUP_CREATE | LOOKUP_EXCL); } /* @@ -2421,11 +2422,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; - int error; + struct dentry *ret; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -2434,14 +2435,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); - error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); - trace_nfs_mkdir_exit(dir, dentry, error); - if (error != 0) - goto out_err; - return 0; -out_err: - d_drop(dentry); - return error; + ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); + trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret)); + return ret; } EXPORT_SYMBOL_GPL(nfs_mkdir); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fae2c7ae4acc..1ac1d3eec517 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -400,8 +400,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 0c3bc98cd999..755ed3c37051 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -578,13 +578,13 @@ out: return status; } -static int +static struct dentry * nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - struct dentry *d_alias; - int status = -ENOMEM; + struct dentry *ret = ERR_PTR(-ENOMEM); + int status; dprintk("NFS call mkdir %pd\n", dentry); @@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (data == NULL) goto out; - status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); - if (status) + ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode, + &default_acl, &acl)); + if (IS_ERR(ret)) goto out; data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; @@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - d_alias = nfs3_do_create(dir, dentry, data); - status = PTR_ERR_OR_ZERO(d_alias); + ret = nfs3_do_create(dir, dentry, data); - if (status != 0) + if (IS_ERR(ret)) goto out_release_acls; - if (d_alias) - dentry = d_alias; + if (ret) + dentry = ret; status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + if (status) { + dput(ret); + ret = ERR_PTR(status); + } - dput(d_alias); out_release_acls: posix_acl_release(acl); 
posix_acl_release(default_acl); out: nfs3_free_createdata(data); - dprintk("NFS reply mkdir: %d\n", status); - return status; + dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret)); + return ret; } static int diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6e95db6c17e9..70c8ea943019 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3154,9 +3154,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (d_really_is_negative(dentry)) { struct dentry *alias; d_drop(dentry); - alias = d_exact_alias(dentry, state->inode); - if (!alias) - alias = d_splice_alias(igrab(state->inode), dentry); + alias = d_splice_alias(igrab(state->inode), dentry); /* d_splice_alias() can't fail here - it's a non-directory */ if (alias) { dput(ctx->dentry); @@ -5139,9 +5137,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - /* Creating a directory bumps nlink in the parent */ - if (data->arg.ftype == NF4DIR) - nfs4_inc_nlink_locked(dir); nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, data->res.fattr->time_start, NFS_INO_INVALID_DATA); @@ -5151,6 +5146,25 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ return status; } +static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, + struct nfs4_createdata *data) +{ + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); + + if (status) + return ERR_PTR(status); + + spin_lock(&dir->i_lock); + /* Creating a directory bumps nlink in the parent */ + nfs4_inc_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); + spin_unlock(&dir->i_lock); + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); +} + static void nfs4_free_createdata(struct nfs4_createdata *data) { nfs4_label_free(data->fattr.label); @@ -5207,32 +5221,34 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr, struct nfs4_label *label) +static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr, + struct nfs4_label *label) { struct nfs4_createdata *data; - int status = -ENOMEM; + struct dentry *ret = ERR_PTR(-ENOMEM); data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - status = nfs4_do_create(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data); nfs4_free_createdata(data); out: - return status; + return ret; } -static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, - struct iattr *sattr) +static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, + struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { .interruptible = true, }; struct nfs4_label l, *label; + struct dentry *alias; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); @@ -5240,14 +5256,15 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - err = _nfs4_proc_mkdir(dir, dentry, sattr, label); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); + err = PTR_ERR_OR_ZERO(alias); trace_nfs4_mkdir(dir, 
&dentry->d_name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); } while (exception.retry); nfs4_label_release_security(label); - return err; + return alias; } static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77920a2e3cef..63e71310b9f6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -446,13 +446,14 @@ out: return status; } -static int +static struct dentry * nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], }; + struct dentry *alias = NULL; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + if (status == 0) { + alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + status = PTR_ERR_OR_ZERO(alias); + } else + alias = ERR_PTR(status); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); - return status; + return alias; } static int diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 28f4d5311c40..c1d9bd07285f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,9 +233,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + if (IS_ERR(dentry)) + status = PTR_ERR(dentry); out_put: - dput(dentry); + if (!status) + dput(dentry); out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 29cb7b812d71..34d7aa531662 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1461,7 +1461,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct inode *dirp; struct iattr *iap = attrs->na_iattr; __be32 err; - int host_err; + int host_err = 0; dentry = fhp->fh_dentry; dirp = d_inode(dentry); @@ -1488,28 +1488,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); - if (!host_err && unlikely(d_unhashed(dchild))) { - struct dentry *d; - d = lookup_one_len(dchild->d_name.name, - dchild->d_parent, - dchild->d_name.len); - if (IS_ERR(d)) { - host_err = PTR_ERR(d); - break; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = nfserr_serverfault; - goto out; - } + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + if (IS_ERR(dchild)) { + host_err = PTR_ERR(dchild); + } else if (d_is_negative(dchild)) { + err = nfserr_serverfault; + goto out; + } else if (unlikely(dchild != resfhp->fh_dentry)) { dput(resfhp->fh_dentry); - resfhp->fh_dentry = dget(d); - err = fh_update(resfhp); - dput(dchild); - dchild = d; - if (err) - goto out; + resfhp->fh_dentry = dget(dchild); } break; case S_IFCHR: @@ -1530,7 +1517,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - dput(dchild); + if (!IS_ERR(dchild)) + dput(dchild); return err; out_nfserr: diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 953fbd5f0851..40f4b1a28705 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ 
-218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = nilfs_transaction_begin(dir->i_sb, &ti, 1); if (err) - return err; + return ERR_PTR(err); inc_nlink(dir); @@ -258,7 +258,7 @@ out: else nilfs_transaction_abort(dir->i_sb); - return err; + return ERR_PTR(err); out_fail: drop_nlink(inode); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 95646f7c46ca..6d386080faf2 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FS_ERROR: return fanotify_error_event_equal(FANOTIFY_EE(old), FANOTIFY_EE(new)); + case FANOTIFY_EVENT_TYPE_MNT: + return false; default: WARN_ON_ONCE(1); } @@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - if (!fid_mode) { + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (data_type != FSNOTIFY_EVENT_MNT) + return 0; + } else if (!fid_mode) { /* Do we have path to open a file descriptor? */ if (!path) return 0; @@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } +static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp) +{ + struct fanotify_mnt_event *pevent; + + pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp); + if (!pevent) + return NULL; + + pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT; + pevent->mnt_id = mnt_id; + + return &pevent->fae; +} + static struct fanotify_event *fanotify_alloc_perm_event(const void *data, int data_type, gfp_t gfp) @@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event( fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + u64 mnt_id = fsnotify_data_mnt_id(data, data_type); struct mem_cgroup *old_memcg; struct dentry *moved = NULL; struct inode *child = NULL; @@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event( moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); - } else { + } else if (path) { event = fanotify_alloc_path_event(path, &hash, gfp); + } else if (mnt_id) { + event = fanotify_alloc_mnt_event(mnt_id, gfp); + } else { + WARN_ON_ONCE(1); } if (!event) @@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_RENAME != FS_RENAME); BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); @@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group, mempool_free(fee, &group->fanotify_data.error_events_pool); } +static void fanotify_free_mnt_event(struct fanotify_event *event) +{ + kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event)); +} + static 
void fanotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { @@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group, case FANOTIFY_EVENT_TYPE_FS_ERROR: fanotify_free_error_event(group, event); break; + case FANOTIFY_EVENT_TYPE_MNT: + fanotify_free_mnt_event(event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c12cbc270539..b44e70e44be6 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +extern struct kmem_cache *fanotify_mnt_event_cachep; /* Possible states of the permission event */ enum { @@ -244,6 +245,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ + FANOTIFY_EVENT_TYPE_MNT, __FANOTIFY_EVENT_TYPE_NUM }; @@ -409,12 +411,23 @@ struct fanotify_path_event { struct path path; }; +struct fanotify_mnt_event { + struct fanotify_event fae; + u64 mnt_id; +}; + static inline struct fanotify_path_event * FANOTIFY_PE(struct fanotify_event *event) { return container_of(event, struct fanotify_path_event, fae); } +static inline struct fanotify_mnt_event * +FANOTIFY_ME(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_mnt_event, fae); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } +static inline bool fanotify_is_mnt_event(u32 mask) +{ + return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH); +} + static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ba3e2d09eb44..f2d840ae4ded 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init; struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; struct kmem_cache *fanotify_path_event_cachep __ro_after_init; struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; +struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ @@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; (sizeof(struct fanotify_event_info_error)) #define FANOTIFY_RANGE_INFO_LEN \ (sizeof(struct fanotify_event_info_range)) +#define FANOTIFY_MNT_INFO_LEN \ + (sizeof(struct fanotify_event_info_mnt)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode, fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (fanotify_is_mnt_event(event->mask)) + event_len += FANOTIFY_MNT_INFO_LEN; if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; @@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_mnt_info_to_user(struct fanotify_event *event, + char __user *buf, int 
count) +{ + struct fanotify_event_info_mnt info = { }; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; + info.hdr.len = FANOTIFY_MNT_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.mnt_id = FANOTIFY_ME(event)->mnt_id; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { @@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_mnt_event(event->mask)) { + ret = copy_mnt_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; + /* Don't allow mixing mnt events with inode events for now */ + if (flags & FAN_REPORT_MNT) { + if (class != FAN_CLASS_NOTIF) + return -EINVAL; + if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) + return -EINVAL; + } + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; - struct vfsmount *mnt = NULL; struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; @@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; - void *obj; + void *obj = NULL; u32 umask = 0; int ret; @@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; + case FAN_MARK_MNTNS: + obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; + break; default: return -EINVAL; } @@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; group = fd_file(f)->private_data; + /* Only report mount events on mnt namespace */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (mask & ~FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type != FAN_MARK_MNTNS) + return -EINVAL; + } else { + if (mask & FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type == FAN_MARK_MNTNS) + return -EINVAL; + } + /* * An unprivileged user is not allowed to setup mount nor filesystem * marks. This also includes setting up such marks by a group that @@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * point. 
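Putting the new fanotify mount-event pieces in these hunks together, a userspace consumer would look roughly like the sketch below. The FAN_REPORT_MNT, FAN_MARK_MNTNS, FAN_MNT_ATTACH/FAN_MNT_DETACH and FAN_EVENT_INFO_TYPE_MNT names and struct fanotify_event_info_mnt come from this patch and are assumed to be exported by the updated uapi header; error handling is omitted:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/fanotify.h>

	int main(void)
	{
		char buf[4096] __attribute__((aligned(8)));
		/* FAN_REPORT_MNT requires FAN_CLASS_NOTIF and excludes fid/fd-error modes. */
		int fan = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_MNT, 0);

		/* Mark the caller's mount namespace through its nsfs entry. */
		fanotify_mark(fan, FAN_MARK_ADD | FAN_MARK_MNTNS,
			      FAN_MNT_ATTACH | FAN_MNT_DETACH,
			      AT_FDCWD, "/proc/self/ns/mnt");

		ssize_t len = read(fan, buf, sizeof(buf));
		struct fanotify_event_metadata *md = (void *)buf;

		while (FAN_EVENT_OK(md, len)) {
			struct fanotify_event_info_mnt *info = (void *)(md + 1);

			if (info->hdr.info_type == FAN_EVENT_INFO_TYPE_MNT)
				printf("mask 0x%llx mnt_id %llu\n",
				       (unsigned long long)md->mask,
				       (unsigned long long)info->mnt_id);
			md = FAN_EVENT_NEXT(md, len);
		}
		return 0;
	}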
*/ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL; @@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) { + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; - } else { - mnt = path.mnt; - if (mark_type == FAN_MARK_MOUNT) - obj = mnt; - else - obj = mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + obj = path.mnt; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + obj = path.mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + obj = mnt_ns_from_dentry(path.dentry); } + ret = -EINVAL; + if (!obj) + goto path_put_and_out; + /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive @@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { - ret = mnt ? -EINVAL : -EISDIR; + ret = !inode ? -EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode))) + (!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out; ret = 0; @@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ - if (mnt || !S_ISDIR(inode->i_mode)) { + if (!inode || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* @@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, @@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void) fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index e933f9c65d90..1161eabf11ee 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", sb->s_dev, mflags, mark->mask, mark->ignore_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector); + + seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n", + mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index fae1b6d397ea..e2b4f17a48bb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + 
fsnotify_clear_marks_by_mntns(mntns); +} + /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. @@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, file_name, cookie, iter_info); } -static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) +static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; @@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); - struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; - __u32 test_mask, marks_mask; + __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); @@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!inode2 || !inode2->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks) && + (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; - marks_mask = READ_ONCE(sb->s_fsnotify_mask); + if (sb) + marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); - + if (mnt_data) + marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. @@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } + if (mnt_data) { + iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = + fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); + } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark @@ -708,11 +721,31 @@ void file_set_fsnotify_mode_from_watchers(struct file *file) } #endif +void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{ + struct fsnotify_mnt data = { + .ns = ns, + .mnt_id = real_mount(mnt)->mnt_id_unique, + }; + + if (WARN_ON_ONCE(!ns)) + return; + + /* + * This is an optimization as well as making sure fsnotify_init() has + * been called. 
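+	 * If ->n_fsnotify_marks is NULL no group ever attached a mark to
+	 * this mount namespace, so nobody can be interested in the event
+	 * and the fsnotify() call below can be skipped entirely.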
+ */ + if (!ns->n_fsnotify_marks) + return; + + fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); +} + static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 663759ed6fbc..5950c7a67f41 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb( return conn->obj; } +static inline struct mnt_namespace *fsnotify_conn_mntns( + struct fsnotify_mark_connector *conn) +{ + return conn->obj; +} + static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { @@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } +static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) +{ + fsnotify_destroy_marks(&mntns->n_fsnotify_marks); +} + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4981439e6209..798340db69d7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); + case FSNOTIFY_OBJ_TYPE_MNTNS: + return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } @@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) + return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } @@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ - if (!fsnotify_sb_info(sb)) { + if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; @@ -770,7 +777,8 @@ restart: /* mark should be the last entry. 
last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone diff --git a/fs/nsfs.c b/fs/nsfs.c index f7fddf8ecf73..59aa801347a7 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -151,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, return 0; } +static bool nsfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case NS_GET_USERNS: + case NS_GET_PARENT: + case NS_GET_NSTYPE: + case NS_GET_OWNER_UID: + case NS_GET_MNTNS_ID: + case NS_GET_PID_FROM_PIDNS: + case NS_GET_TGID_FROM_PIDNS: + case NS_GET_PID_IN_PIDNS: + case NS_GET_TGID_IN_PIDNS: + return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + } + + /* Extensible ioctls require some extra handling. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_INFO): + case _IOC_NR(NS_MNT_GET_NEXT): + case _IOC_NR(NS_MNT_GET_PREV): + return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + } + + return false; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct user_namespace *user_ns; struct pid_namespace *pid_ns; struct task_struct *tsk; - struct ns_common *ns = get_proc_ns(file_inode(filp)); + struct ns_common *ns; struct mnt_namespace *mnt_ns; bool previous = false; uid_t __user *argp; uid_t uid; int ret; + if (!nsfs_ioctl_valid(ioctl)) + return -ENOIOCTLCMD; + + ns = get_proc_ns(file_inode(filp)); switch (ioctl) { case NS_GET_USERNS: return open_related_ns(ns, ns_get_owner); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index abf7e81584a9..652735a0b0c4 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -201,11 +201,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, - NULL, 0, NULL); + return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, + NULL, 0, NULL)); } /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 2a7f36643895..5130ec44e5e1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. 
*/ /* SMP-safe */ -static int dlmfs_mkdir(struct mnt_idmap * idmap, - struct inode * dir, - struct dentry * dentry, - umode_t mode) +static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, + struct inode * dir, + struct dentry * dentry, + umode_t mode) { int status; struct inode *inode = NULL; @@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap, bail: if (status < 0) iput(inode); - return status; + return ERR_PTR(status); } static int dlmfs_create(struct mnt_idmap *idmap, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0ec63a1a94b8..99278c8f0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -644,10 +644,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct mnt_idmap *idmap, - struct inode *dir, - struct dentry *dentry, - umode_t mode) +static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { int ret; @@ -657,7 +657,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap, if (ret) mlog_errno(ret); - return ret; + return ERR_PTR(ret); } static int ocfs2_create(struct mnt_idmap *idmap, diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 6bda275826d6..2ed541fccf33 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,10 +279,10 @@ out_free_inode: return err; } -static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return omfs_add_node(dir, dentry, mode | S_IFDIR); + return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR)); } static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/open.c b/fs/open.c index 1be20de9f283..a9063cca9911 100644 --- a/fs/open.c +++ b/fs/open.c @@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -123,7 +123,7 @@ mnt_drop_write_and_out: } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_ftruncate(struct file *file, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small) return error; } -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; @@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; loff_t sum; if (offset < 0 || len <= 0) @@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void) return override_creds(override_cred); } -static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; 
struct inode *inode; @@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root, } EXPORT_SYMBOL(file_open_root); -static long do_sys_openat2(int dfd, const char __user *filename, - struct open_how *how) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(how, &op); struct filename *tmp; + int err, fd; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); - if (fd >= 0) { + if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); @@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename, return fd; } -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); @@ -1551,7 +1552,7 @@ int filp_close(struct file *filp, fl_owner_t id) int retval; retval = filp_flush(filp, id); - fput(filp); + fput_close(filp); return retval; } @@ -1577,13 +1578,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd) * We're returning to user space. Don't bother * with any delayed fput() cases. */ - __fput_sync(file); + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index d68372241b30..90c49c0de243 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, int buffer_index; ssize_t ret; size_t copy_amount; - int open_for_read; - int open_for_write; + bool open_for_read; + bool open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aae6d2b8767d..5ac743c6bc2e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -16,22 +16,22 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int orangefs_writepage_locked(struct page *page, - struct writeback_control *wbc) +static int orangefs_writepage_locked(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; - set_page_writeback(page); + folio_start_writeback(folio); len = i_size_read(inode); - if (PagePrivate(page)) { - wr = (struct orangefs_write_range *)page_private(page); + if (folio->private) { + wr = folio->private; WARN_ON(wr->pos >= len); off = wr->pos; if (off + wr->len > len) @@ -40,36 +40,27 @@ static int orangefs_writepage_locked(struct page *page, wlen = wr->len; } else { WARN_ON(1); - off = page_offset(page); - if (off + PAGE_SIZE > len) + off = folio_pos(folio); + wlen = folio_size(folio); + + if (wlen > len - off) wlen = len - off; - else - wlen = PAGE_SIZE; } /* Should've been 
handled in orangefs_invalidate_folio. */ WARN_ON(off == len || off + wlen > len); WARN_ON(wlen == 0); - bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); + bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off)); iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); if (ret < 0) { - mapping_set_error(page->mapping, ret); + mapping_set_error(folio->mapping, ret); } else { ret = 0; } - kfree(detach_page_private(page)); - return ret; -} - -static int orangefs_writepage(struct page *page, struct writeback_control *wbc) -{ - int ret; - ret = orangefs_writepage_locked(page, wbc); - unlock_page(page); - end_page_writeback(page); + kfree(folio_detach_private(folio)); return ret; } @@ -79,33 +70,33 @@ struct orangefs_writepages { kuid_t uid; kgid_t gid; int maxpages; - int npages; - struct page **pages; + int nfolios; + struct address_space *mapping; + struct folio **folios; struct bio_vec *bv; }; static int orangefs_writepages_work(struct orangefs_writepages *ow, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = ow->pages[0]->mapping->host; + struct inode *inode = ow->mapping->host; struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + size_t start; + loff_t len, off; int i; len = i_size_read(inode); - for (i = 0; i < ow->npages; i++) { - set_page_writeback(ow->pages[i]); - bvec_set_page(&ow->bv[i], ow->pages[i], - min(page_offset(ow->pages[i]) + PAGE_SIZE, - ow->off + ow->len) - - max(ow->off, page_offset(ow->pages[i])), - i == 0 ? ow->off - page_offset(ow->pages[i]) : 0); + start = offset_in_folio(ow->folios[0], ow->off); + for (i = 0; i < ow->nfolios; i++) { + folio_start_writeback(ow->folios[i]); + bvec_set_folio(&ow->bv[i], ow->folios[i], + folio_size(ow->folios[i]) - start, start); + start = 0; } - iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -116,40 +107,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, 0, &wr, NULL, NULL); - if (ret < 0) { - for (i = 0; i < ow->npages; i++) { - mapping_set_error(ow->pages[i]->mapping, ret); - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } - } else { + if (ret < 0) + mapping_set_error(ow->mapping, ret); + else ret = 0; - for (i = 0; i < ow->npages; i++) { - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } + + for (i = 0; i < ow->nfolios; i++) { + wrp = folio_detach_private(ow->folios[i]); + kfree(wrp); + folio_end_writeback(ow->folios[i]); + folio_unlock(ow->folios[i]); } + return ret; } static int orangefs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, struct orangefs_writepages *ow) { - struct orangefs_writepages *ow = data; struct orangefs_write_range *wr = folio->private; int ret; @@ -162,41 +137,41 @@ static int orangefs_writepages_callback(struct folio 
*folio, } ret = -1; - if (ow->npages == 0) { + if (ow->nfolios == 0) { ow->off = wr->pos; ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; ret = -1; goto done; } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } done: if (ret == -1) { - if (ow->npages) { + if (ow->nfolios) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } - ret = orangefs_writepage_locked(&folio->page, wbc); + ret = orangefs_writepage_locked(folio, wbc); mapping_set_error(folio->mapping, ret); folio_unlock(folio); folio_end_writeback(folio); } else { - if (ow->npages == ow->maxpages) { + if (ow->nfolios == ow->maxpages) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } } return ret; @@ -207,31 +182,35 @@ static int orangefs_writepages(struct address_space *mapping, { struct orangefs_writepages *ow; struct blk_plug plug; - int ret; + int error; + struct folio *folio = NULL; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); if (!ow) return -ENOMEM; ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; - ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); - if (!ow->pages) { + ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL); + if (!ow->folios) { kfree(ow); return -ENOMEM; } ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); if (!ow->bv) { - kfree(ow->pages); + kfree(ow->folios); kfree(ow); return -ENOMEM; } + ow->mapping = mapping; blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); - if (ow->npages) - ret = orangefs_writepages_work(ow, wbc); + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = orangefs_writepages_callback(folio, wbc, ow); + if (ow->nfolios) + error = orangefs_writepages_work(ow, wbc); blk_finish_plug(&plug); - kfree(ow->pages); + kfree(ow->folios); kfree(ow->bv); kfree(ow); - return ret; + return error; } static int orangefs_launder_folio(struct folio *); @@ -484,7 +463,7 @@ static int orangefs_launder_folio(struct folio *folio) }; folio_wait_writeback(folio); if (folio_clear_dirty_for_io(folio)) { - r = orangefs_writepage_locked(&folio->page, &wbc); + r = orangefs_writepage_locked(folio, &wbc); folio_end_writeback(folio); } return r; @@ -606,7 +585,6 @@ out: /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { - .writepage = orangefs_writepage, .readahead = orangefs_readahead, .read_folio = orangefs_read_folio, .writepages = orangefs_writepages, @@ -616,6 +594,7 @@ static const struct address_space_operations orangefs_address_operations = { .invalidate_folio = orangefs_invalidate_folio, .release_folio = orangefs_release_folio, .free_folio = orangefs_free_folio, + .migrate_folio = filemap_migrate_folio, .launder_folio = orangefs_launder_folio, .direct_IO = orangefs_direct_IO, }; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 200558ec72f0..82395fe2b956 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -300,8 +300,8 @@ out: return ret; } -static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry 
*orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; @@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); if (!new_op) - return -ENOMEM; + return ERR_PTR(-ENOMEM); new_op->upcall.req.mkdir.parent_refn = parent->refn; @@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, __orangefs_setattr(dir, &iattr); out: op_release(new_op); - return ret; + return ERR_PTR(ret); } static int orangefs_rename(struct mnt_idmap *idmap, diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index 6e079d4230d0..d4463534cec6 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -43,47 +43,4 @@ #define GOSSIP_MAX_NR 16 #define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1) -/* a private internal type */ -struct __keyword_mask_s { - const char *keyword; - __u64 mask_val; -}; - -/* - * Map all kmod keywords to kmod debug masks here. Keep this - * structure "packed": - * - * "all" is always last... - * - * keyword mask_val index - * foo 1 0 - * bar 2 1 - * baz 4 2 - * qux 8 3 - * . . . - */ -static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { - {"super", GOSSIP_SUPER_DEBUG}, - {"inode", GOSSIP_INODE_DEBUG}, - {"file", GOSSIP_FILE_DEBUG}, - {"dir", GOSSIP_DIR_DEBUG}, - {"utils", GOSSIP_UTILS_DEBUG}, - {"wait", GOSSIP_WAIT_DEBUG}, - {"acl", GOSSIP_ACL_DEBUG}, - {"dcache", GOSSIP_DCACHE_DEBUG}, - {"dev", GOSSIP_DEV_DEBUG}, - {"name", GOSSIP_NAME_DEBUG}, - {"bufmap", GOSSIP_BUFMAP_DEBUG}, - {"cache", GOSSIP_CACHE_DEBUG}, - {"debugfs", GOSSIP_DEBUGFS_DEBUG}, - {"xattr", GOSSIP_XATTR_DEBUG}, - {"init", GOSSIP_INIT_DEBUG}, - {"sysfs", GOSSIP_SYSFS_DEBUG}, - {"none", GOSSIP_NO_DEBUG}, - {"all", GOSSIP_MAX_DEBUG} -}; - -static const int num_kmod_keyword_mask_map = (int) - (ARRAY_SIZE(s_kmod_keyword_mask_map)); - #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f52073022fae..f7095c91660c 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -44,6 +44,49 @@ #include "protocol.h" #include "orangefs-kernel.h" +/* a private internal type */ +struct __keyword_mask_s { + const char *keyword; + __u64 mask_val; +}; + +/* + * Map all kmod keywords to kmod debug masks here. Keep this + * structure "packed": + * + * "all" is always last... + * + * keyword mask_val index + * foo 1 0 + * bar 2 1 + * baz 4 2 + * qux 8 3 + * . . . 
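+ * (Defined in this file rather than in orangefs-debug.h so that only
+ * this translation unit carries a copy of the table.)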
+ */ +static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { + {"super", GOSSIP_SUPER_DEBUG}, + {"inode", GOSSIP_INODE_DEBUG}, + {"file", GOSSIP_FILE_DEBUG}, + {"dir", GOSSIP_DIR_DEBUG}, + {"utils", GOSSIP_UTILS_DEBUG}, + {"wait", GOSSIP_WAIT_DEBUG}, + {"acl", GOSSIP_ACL_DEBUG}, + {"dcache", GOSSIP_DCACHE_DEBUG}, + {"dev", GOSSIP_DEV_DEBUG}, + {"name", GOSSIP_NAME_DEBUG}, + {"bufmap", GOSSIP_BUFMAP_DEBUG}, + {"cache", GOSSIP_CACHE_DEBUG}, + {"debugfs", GOSSIP_DEBUGFS_DEBUG}, + {"xattr", GOSSIP_XATTR_DEBUG}, + {"init", GOSSIP_INIT_DEBUG}, + {"sysfs", GOSSIP_SYSFS_DEBUG}, + {"none", GOSSIP_NO_DEBUG}, + {"all", GOSSIP_MAX_DEBUG} +}; + +static const int num_kmod_keyword_mask_map = (int) + (ARRAY_SIZE(s_kmod_keyword_mask_map)); + #define DEBUG_HELP_STRING_SIZE 4096 #define HELP_STRING_UNINITIALIZED \ "Client Debug Keywords are unknown until the first time\n" \ diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index c9993ff66fc2..fe493f3ed6b6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,37 +138,6 @@ kill_whiteout: goto out; } -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode) -{ - int err; - struct dentry *d, *dentry = *newdentry; - - err = ovl_do_mkdir(ofs, dir, dentry, mode); - if (err) - return err; - - if (likely(!d_unhashed(dentry))) - return 0; - - /* - * vfs_mkdir() may succeed and leave the dentry passed - * to it unhashed and negative. If that happens, try to - * lookup a new hashed and positive dentry. - */ - d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", - dentry, err); - return PTR_ERR(d); - } - dput(dentry); - *newdentry = d; - - return 0; -} - struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr) { @@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, case S_IFDIR: /* mkdir is special... */ - err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); + newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); + err = PTR_ERR_OR_ZERO(newdentry); break; case S_IFCHR: @@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, } out: if (err) { - dput(newdentry); + if (!IS_ERR(newdentry)) + dput(newdentry); return ERR_PTR(err); } return newdentry; @@ -282,7 +253,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, * XXX: if we ever use ovl_obtain_alias() to decode directory * file handles, need to use ovl_get_inode_locked() and * d_instantiate_new() here to prevent from creating two - * hashed directory inode aliases. + * hashed directory inode aliases. We then need to return + * the obtained alias to ovl_mkdir(). 
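+	 * That is because ->mkdir() now returns the dentry that ends up
+	 * in use, so a different alias would have to be propagated back
+	 * through ovl_mkdir()'s return value.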
*/ inode = ovl_get_inode(dentry->d_sb, &oip); if (IS_ERR(inode)) @@ -687,10 +659,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); + return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL)); } static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0021e2025020..6f2f8f4cfbbc 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs, return err; } -static inline int ovl_do_mkdir(struct ovl_fs *ofs, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); - return err; + dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); + return dentry; } static inline int ovl_do_mknod(struct ovl_fs *ofs, @@ -838,8 +839,6 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode); struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 1115c22deca0..6759f7d040c8 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -59,6 +59,7 @@ enum ovl_opt { Opt_metacopy, Opt_verity, Opt_volatile, + Opt_override_creds, }; static const struct constant_table ovl_parameter_bool[] = { @@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), + fsparam_flag_no("override_creds", Opt_override_creds), {} }; @@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_userxattr: config->userxattr = true; break; + case Opt_override_creds: { + const struct cred *cred = NULL; + + if (result.negated) { + swap(cred, ofs->creator_cred); + put_cred(cred); + break; + } + + if (!current_in_userns(fc->user_ns)) { + err = -EINVAL; + break; + } + + cred = prepare_creds(); + if (cred) + swap(cred, ofs->creator_cred); + else + err = -ENOMEM; + + put_cred(cred); + break; + } default: pr_err("unrecognized mount option \"%s\" or missing value\n", param->key); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 86ae6f6da36b..b63474d1b064 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -327,9 +327,10 @@ retry: goto retry; } - err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); - if (err) - goto out_dput; + work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); + err = PTR_ERR(work); + if (IS_ERR(work)) + goto out_err; /* Weird filesystem returning with hashed negative (kernfs)? 
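	 * ovl_do_mkdir() hands back whatever dentry vfs_mkdir() returned,
	 * which may differ from the one passed in, so "work" already refers
	 * to the dentry that needs sanity checking here.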
*/ err = -EINVAL; @@ -1305,6 +1306,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; struct ovl_fs_context *ctx = fc->fs_private; + const struct cred *old_cred = NULL; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_layer *layers; @@ -1318,10 +1320,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - ofs->creator_cred = cred = prepare_creds(); + if (!ofs->creator_cred) + ofs->creator_cred = cred = prepare_creds(); + else + cred = (struct cred *)ofs->creator_cred; if (!cred) goto out_err; + old_cred = ovl_override_creds(sb); + err = ovl_fs_params_verify(ctx, &ofs->config); if (err) goto out_err; @@ -1481,11 +1488,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = root_dentry; + ovl_revert_creds(old_cred); return 0; out_free_oe: ovl_free_entry(oe); out_err: + /* + * Revert creds before calling ovl_free_fs() which will call + * put_cred() and put_cred() requires that the cred's that are + * put are not the caller's creds, i.e., current->cred. + */ + if (old_cred) + ovl_revert_creds(old_cred); ovl_free_fs(ofs); sb->s_fs_info = NULL; return err; diff --git a/fs/pidfs.c b/fs/pidfs.c index c0478b3c55d9..d64a4cbeb0da 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -24,6 +24,28 @@ #include "internal.h" #include "mount.h" +static struct kmem_cache *pidfs_cachep __ro_after_init; + +/* + * Stashes information that userspace needs to access even after the + * process has been reaped. + */ +struct pidfs_exit_info { + __u64 cgroupid; + __s32 exit_code; +}; + +struct pidfs_inode { + struct pidfs_exit_info __pei; + struct pidfs_exit_info *exit_info; + struct inode vfs_inode; +}; + +static inline struct pidfs_inode *pidfs_i(struct inode *inode) +{ + return container_of(inode, struct pidfs_inode, vfs_inode); +} + static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -188,36 +210,48 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); - bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* - * Depending on PIDFD_THREAD, inform pollers when the thread - * or the whole thread-group exits. + * Don't wake waiters if the thread-group leader exited + * prematurely. They either get notified when the last subthread + * exits or not at all if one of the remaining subthreads execs + * and assumes the struct pid of the old thread-group leader. 
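+	 * delay_group_leader() is true while the task is a thread-group
+	 * leader whose group still has other live threads; in that case
+	 * the exit is not reportable yet, so readiness is held back below.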
*/ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; - else if (task->exit_state && (thread || thread_group_empty(task))) + else if (task->exit_state && !delay_group_leader(task)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +static inline bool pid_in_current_pidns(const struct pid *pid) +{ + const struct pid_namespace *ns = task_active_pid_ns(current); + + if (ns->level <= pid->level) + return pid->numbers[ns->level].ns == ns; + + return false; +} + +static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + struct inode *inode = file_inode(file); + struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; + struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; + struct task_struct *task; const struct cred *c; __u64 mask; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; -#endif if (!uinfo) return -EINVAL; @@ -227,6 +261,37 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; + /* + * Restrict information retrieval to tasks within the caller's pid + * namespace hierarchy. + */ + if (!pid_in_current_pidns(pid)) + return -ESRCH; + + if (mask & PIDFD_INFO_EXIT) { + exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + if (exit_info) { + kinfo.mask |= PIDFD_INFO_EXIT; +#ifdef CONFIG_CGROUPS + kinfo.cgroupid = exit_info->cgroupid; + kinfo.mask |= PIDFD_INFO_CGROUPID; +#endif + kinfo.exit_code = exit_info->exit_code; + } + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + /* + * If the task has already been reaped, only exit + * information is available + */ + if (!(mask & PIDFD_INFO_EXIT)) + return -ESRCH; + + goto copy_out; + } + c = get_task_cred(task); if (!c) return -ESRCH; @@ -246,11 +311,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long put_cred(c); #ifdef CONFIG_CGROUPS - rcu_read_lock(); - cgrp = task_dfl_cgroup(task); - kinfo.cgroupid = cgroup_id(cgrp); - kinfo.mask |= PIDFD_INFO_CGROUPID; - rcu_read_unlock(); + if (!kinfo.cgroupid) { + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); + } #endif /* @@ -270,16 +339,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; +copy_out: /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. 
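	 * copy_struct_to_user() implements exactly these semantics: it
	 * copies min(usize, sizeof(kinfo)) bytes and zero-fills the
	 * remaining tail of a larger userspace struct.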
*/ - if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) - return -EFAULT; - - return 0; + return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); } static bool pidfs_ioctl_valid(unsigned int cmd) @@ -317,7 +384,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; - struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; @@ -332,13 +398,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return put_user(file_inode(file)->i_generation, argp); } - task = get_pid_task(pid, PIDTYPE_PID); - if (!task) - return -ESRCH; - /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) - return pidfd_info(task, cmd, arg); + return pidfd_info(file, cmd, arg); + + task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); + if (!task) + return -ESRCH; if (arg) return -EINVAL; @@ -450,6 +516,49 @@ struct pid *pidfd_pid(const struct file *file) return file_inode(file)->i_private; } +/* + * We're called from release_task(). We know there's at least one + * reference to struct pid being held that won't be released until the + * task has been reaped which cannot happen until we're out of + * release_task(). + * + * If this struct pid is referred to by a pidfd then + * stashed_dentry_get() will return the dentry and inode for that struct + * pid. Since we've taken a reference on it there's now an additional + * reference from the exit path on it. Which is fine. We're going to put + * it again in a second and we know that the pid is kept alive anyway. + * + * Worst case is that we've filled in the info and immediately free the + * dentry and inode afterwards since the pidfd has been closed. Since + * pidfs_exit() currently is placed after exit_task_work() we know that + * it cannot be us aka the exiting task holding a pidfd to ourselves. + */ +void pidfs_exit(struct task_struct *tsk) +{ + struct dentry *dentry; + + might_sleep(); + + dentry = stashed_dentry_get(&task_pid(tsk)->stashed); + if (dentry) { + struct inode *inode = d_inode(dentry); + struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + /* Ensure that PIDFD_GET_INFO sees either all or nothing. 
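+		 * The release pairs with the READ_ONCE() of ->exit_info in
+		 * pidfd_info(): once the pointer is observed, the cgroupid
+		 * and exit_code stores above are guaranteed to be visible
+		 * as well.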
*/ + smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); + dput(dentry); + } +} + static struct vfsmount *pidfs_mnt __ro_after_init; /* @@ -505,9 +614,30 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } +static struct inode *pidfs_alloc_inode(struct super_block *sb) +{ + struct pidfs_inode *pi; + + pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); + if (!pi) + return NULL; + + memset(&pi->__pei, 0, sizeof(pi->__pei)); + pi->exit_info = NULL; + + return &pi->vfs_inode; +} + +static void pidfs_free_inode(struct inode *inode) +{ + kmem_cache_free(pidfs_cachep, pidfs_i(inode)); +} + static const struct super_operations pidfs_sops = { + .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, + .free_inode = pidfs_free_inode, .statfs = simple_statfs, }; @@ -633,8 +763,49 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } +static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, + unsigned int flags) +{ + enum pid_type type; + + if (flags & PIDFD_CLONE) + return true; + + /* + * Make sure that if a pidfd is created PIDFD_INFO_EXIT + * information will be available. So after an inode for the + * pidfd has been allocated perform another check that the pid + * is still alive. If it is exit information is available even + * if the task gets reaped before the pidfd is returned to + * userspace. The only exception is PIDFD_CLONE where no task + * linkage has been established for @pid yet and the kernel is + * in the middle of process creation so there's nothing for + * pidfs to miss. + */ + if (flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + + /* + * Since pidfs_exit() is called before struct pid's task linkage + * is removed the case where the task got reaped but a dentry + * was already attached to struct pid and exit information was + * recorded and published can be handled correctly. + */ + if (unlikely(!pid_has_task(pid, type))) { + struct inode *inode = d_inode(path->dentry); + return !!READ_ONCE(pidfs_i(inode)->exit_info); + } + + return true; +} + static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { + if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) + return ERR_PTR(-ESRCH); + /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -698,22 +869,46 @@ static struct file_system_type pidfs_type = { struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { - struct file *pidfd_file; - struct path path; + struct path path __free(path_put) = {}; int ret; + /* + * Ensure that PIDFD_CLONE can be passed as a flag without + * overloading other uapi pidfd flags. + */ + BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD); + BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK); + ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); + if (!pidfs_pid_valid(pid, &path, flags)) + return ERR_PTR(-ESRCH); + + flags &= ~PIDFD_CLONE; pidfd_file = dentry_open(&path, flags, current_cred()); - path_put(&path); + /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. 
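+	 * (PIDFD_THREAD aliases O_EXCL, which do_dentry_open() clears from
+	 * f_flags, so it has to be re-applied for later f_flags checks.)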
*/ + if (!IS_ERR(pidfd_file)) + pidfd_file->f_flags |= (flags & PIDFD_THREAD); + return pidfd_file; } +static void pidfs_inode_init_once(void *data) +{ + struct pidfs_inode *pi = data; + + inode_init_once(&pi->vfs_inode); +} + void __init pidfs_init(void) { + pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), + pidfs_inode_init_once); pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); diff --git a/fs/pipe.c b/fs/pipe.c index 4d0799e4e719..da45edd68c41 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock(pipe2); } +static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe) +{ + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) { + struct page *page = pipe->tmp_page[i]; + pipe->tmp_page[i] = NULL; + return page; + } + } + + return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); +} + +static void anon_pipe_put_page(struct pipe_inode_info *pipe, + struct page *page) +{ + if (page_count(page) == 1) { + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (!pipe->tmp_page[i]) { + pipe->tmp_page[i] = page; + return; + } + } + } + + put_page(page); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - /* - * If nobody else uses this page, and we don't already have a - * temporary page, let's keep track of it as a one-deep - * allocation cache. (Otherwise just release our reference to it) - */ - if (page_count(page) == 1 && !pipe->tmp_page) - pipe->tmp_page = page; - else - put_page(page); + anon_pipe_put_page(pipe, page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, @@ -247,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -274,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { @@ -301,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) #endif if (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t chars = buf->len; size_t written; int error; @@ -359,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. 
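	 * A writer that cannot block either returns what it has copied so
	 * far or -EAGAIN if nothing has been written yet.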
- */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -390,7 +389,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } @@ -403,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -424,7 +429,7 @@ static inline bool pipe_writable(const struct pipe_inode_info *pipe) } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -471,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && @@ -503,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { - unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; - struct page *page = pipe->tmp_page; + struct page *page; int copied; - if (!page) { - page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); - if (unlikely(!page)) { - ret = ret ? : -ENOMEM; - break; - } - pipe->tmp_page = page; + page = anon_pipe_get_page(pipe); + if (unlikely(!page)) { + if (!ret) + ret = -ENOMEM; + break; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + anon_pipe_put_page(pipe, page); + if (!ret) + ret = -EFAULT; + break; + } + pipe->head = head + 1; /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; - } - if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + } /* Wait for buffer space to become available. 
*/ if ((filp->f_flags & O_NONBLOCK) || @@ -602,11 +596,21 @@ out: kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -853,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif - if (pipe->tmp_page) - __free_page(pipe->tmp_page); + for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) + __free_page(pipe->tmp_page[i]); + } kfree(pipe->bufs); kfree(pipe); } @@ -874,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -891,7 +899,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -934,7 +942,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -945,7 +953,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1109,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_pipe = 0; @@ -1234,8 +1242,19 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1271,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) @@ -1390,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if 
(file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; diff --git a/fs/pnode.c b/fs/pnode.c index ef048f008bdd..7a062a5de10e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -150,7 +150,7 @@ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { @@ -174,7 +174,7 @@ static struct mount *skip_propagation_subtree(struct mount *m, * Advance m such that propagation_next will not return * the slaves of m. */ - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; @@ -185,7 +185,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) while (1) { while (1) { struct mount *next; - if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { @@ -226,7 +226,7 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) struct mount *child; int type; /* skip ones added by this propagate_mnt() */ - if (IS_MNT_NEW(m)) + if (IS_MNT_PROPAGATED(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) @@ -380,7 +380,7 @@ bool propagation_would_overmount(const struct mount *from, if (!IS_MNT_SHARED(from)) return false; - if (IS_MNT_NEW(to)) + if (IS_MNT_PROPAGATED(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) @@ -549,8 +549,10 @@ static void restore_mounts(struct list_head *to_restore) mp = parent->mnt_mp; parent = parent->mnt_parent; } - if (parent != mnt->mnt_parent) + if (parent != mnt->mnt_parent) { mnt_change_mountpoint(parent, mp, mnt); + mnt_notify_add(mnt); + } } } diff --git a/fs/pnode.h b/fs/pnode.h index 0b02a6393891..ddafe0d087ca 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@ #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns)) +#define IS_MNT_PROPAGATED(m) (!(m)->mnt_ns || ((m)->mnt_ns->mntns_flags & MNTNS_PROPAGATING)) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) diff --git a/fs/proc/base.c b/fs/proc/base.c index cd89e956c322..5538c4aee8fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = { #endif -#ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ @@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = { .release = single_release, }; -#endif - #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid *pid; + struct task_struct *task; + struct 
pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) @@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_hlist_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%px\n", - timer->sigq.info.si_signo, + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif @@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ec90826a49e..a3e22803cddf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -559,10 +559,16 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static inline void pde_set_flags(struct proc_dir_entry *pde) +static void pde_set_flags(struct proc_dir_entry *pde) { if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) pde->flags |= PROC_ENTRY_PERMANENT; + if (pde->proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (pde->proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, @@ -626,6 +632,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -656,6 +663,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 626ad7bd94f2..a3eb3b740f76 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -656,13 +656,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (de->proc_ops->proc_compat_ioctl) { - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_compat_ioctl(de)) { + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1695509370b8..77a517f91821 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -85,6 +85,20 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde) pde->flags |= PROC_ENTRY_PERMANENT; } +static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_read_iter; +} + +static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) +{ +#ifdef CONFIG_COMPAT + return pde->flags & PROC_ENTRY_proc_compat_ioctl; +#else + return false; +#endif +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff 
--git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1cb33771bf9f..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -34,8 +34,6 @@ #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -122,7 +120,9 @@ static void update_kcore_size(void) kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); kcore_notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + @@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce79..bb3b769edc71 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -14,10 +14,10 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/string.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/ramfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/magic.h> #include <linux/pstore.h> @@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb) } enum { - Opt_kmsg_bytes, Opt_err + Opt_kmsg_bytes }; -static const match_table_t tokens = { - {Opt_kmsg_bytes, "kmsg_bytes=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec pstore_param_spec[] = { + fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes), + {} }; -static void parse_options(char *options) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return; +struct pstore_context { + unsigned int kmsg_bytes; +}; - while ((p = strsep(&options, ",")) != NULL) { - int token; +static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct pstore_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - if (!*p) - continue; + opt = fs_parse(fc, pstore_param_spec, param, &result); + /* pstore has historically ignored invalid kmsg_bytes param */ + if (opt < 0) + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_kmsg_bytes: - if (!match_int(&args[0], &option)) - pstore_set_kmsg_bytes(option); - break; - } + switch (opt) { + case Opt_kmsg_bytes: + ctx->kmsg_bytes = result.uint_32; + break; + default: + return -EINVAL; } + + return 0; } /* @@ -265,14 +266,16 @@ static void parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } -static int pstore_remount(struct super_block *sb, int *flags, char *data) +static int pstore_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - parse_options(data); + struct pstore_context *ctx = 
fc->fs_private; + + sync_filesystem(fc->root->d_sb); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); return 0; } @@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = pstore_evict_inode, - .remount_fs = pstore_remount, .show_options = pstore_show_options, }; @@ -406,8 +408,9 @@ void pstore_get_records(int quiet) inode_unlock(d_inode(root)); } -static int pstore_fill_super(struct super_block *sb, void *data, int silent) +static int pstore_fill_super(struct super_block *sb, struct fs_context *fc) { + struct pstore_context *ctx = fc->fs_private; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &pstore_ops; sb->s_time_gran = 1; - parse_options(data); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); inode = pstore_get_inode(sb); if (inode) { @@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *pstore_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int pstore_get_tree(struct fs_context *fc) +{ + if (fc->root) + return pstore_reconfigure(fc); + + return get_tree_single(fc, pstore_fill_super); +} + +static void pstore_free_fc(struct fs_context *fc) { - return mount_single(fs_type, flags, data, pstore_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations pstore_context_ops = { + .parse_param = pstore_parse_param, + .get_tree = pstore_get_tree, + .reconfigure = pstore_reconfigure, + .free = pstore_free_fc, +}; + static void pstore_kill_sb(struct super_block *sb) { guard(mutex)(&pstore_sb_lock); @@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb) INIT_LIST_HEAD(&records_list); } +static int pstore_init_fs_context(struct fs_context *fc) +{ + struct pstore_context *ctx; + + ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* + * Global kmsg_bytes is initialized to default, and updated + * every time we (re)mount the single-sb filesystem with the + * option specified. 
+ */ + ctx->kmsg_bytes = kmsg_bytes; + + fc->fs_private = ctx; + fc->ops = &pstore_context_ops; + + return 0; +} + static struct file_system_type pstore_fs_type = { .owner = THIS_MODULE, .name = "pstore", - .mount = pstore_mount, .kill_sb = pstore_kill_sb, + .init_fs_context = pstore_init_fs_context, + .parameters = pstore_param_spec, }; int __init pstore_init_fs(void) diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80c..557cf9d40177 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8006faaaf0ec..775fa905fda0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -119,13 +119,13 @@ out: return error; } -static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/read_write.c b/fs/read_write.c index a6133241dfb8..bb0ed26a0b3a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, if (whence == SEEK_CUR) { /* - * f_lock protects against read/modify/write race with - * other SEEK_CURs. Note that parallel writes and reads - * behave like SEEK_SET. 
+ * If the file requires locking via f_pos_lock we know + * that mutual exclusion for SEEK_CUR on the same file + * is guaranteed. If the file isn't locked, we take + * f_lock to protect against f_pos races with other + * SEEK_CURs. */ - guard(spinlock)(&file->f_lock); + if (file_seek_cur_needs_f_lock(file)) { + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); + } return vfs_setpos(file, file->f_pos + offset, maxsize); } diff --git a/fs/signalfd.c b/fs/signalfd.c index d1a5f43ce466..d469782f97f4 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) return ufd; } - file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx, - O_RDWR | (flags & O_NONBLOCK)); + file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, + ctx, O_RDWR | (flags & O_NONBLOCK), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; - fd_install(ufd, file); } else { CLASS(fd, f)(ufd); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 831fee962c4d..8dea0cf3a8de 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index d07682020c64..4fc9485c5d91 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -114,19 +114,23 @@ again: mutex_lock(&ses->session_mutex); /* - * Recheck after acquire mutex. If another thread is negotiating - * and the server never sends an answer the socket will be closed - * and tcpStatus set to reconnect. + * Handle the case where a concurrent thread failed to negotiate or + * killed a channel. */ spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsNeedReconnect) { + switch (server->tcpStatus) { + case CifsExiting: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - - if (tcon->retry) - goto again; - rc = -EHOSTDOWN; - goto out; + return -EHOSTDOWN; + case CifsNeedReconnect: + spin_unlock(&server->srv_lock); + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + default: + break; } spin_unlock(&server->srv_lock); @@ -152,16 +156,20 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) { - rc = cifs_setup_session(0, ses, server, ses->local_nls); - if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { - /* - * Try alternate password for next reconnect if an alternate - * password is available. - */ - if (ses->password2) - swap(ses->password2, ses->password); - } + if (rc) { + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + } + rc = cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect if an alternate + * password is available. 
+ */ + if (ses->password2) + swap(ses->password2, ses->password); } /* do we need to reconnect tcon? */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8582cf61242c..9e4f7378f30f 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) spin_unlock(&tcon->tc_lock); /* - * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * BB Add call to evict_inodes(sb) for all superblocks mounted * to this tcon. */ } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 616149c7f0a5..3bb21aa58474 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2207,8 +2207,8 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, - struct dentry *direntry, umode_t mode) +struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, + struct dentry *direntry, umode_t mode) { int rc = 0; unsigned int xid; @@ -2224,10 +2224,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, cifs_sb = CIFS_SB(inode->i_sb); if (unlikely(cifs_forced_shutdown(cifs_sb))) - return -EIO; + return ERR_PTR(-EIO); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return PTR_ERR(tlink); + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); @@ -2283,7 +2283,7 @@ mkdir_out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); - return rc; + return ERR_PTR(rc); } int cifs_rmdir(struct inode *inode, struct dentry *direntry) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ed7812247ebc..f9c521b3c65e 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -300,32 +300,23 @@ again: mutex_lock(&ses->session_mutex); /* - * if this is called by delayed work, and the channel has been disabled - * in parallel, the delayed work can continue to execute in parallel - * there's a chance that this channel may not exist anymore + * Handle the case where a concurrent thread failed to negotiate or + * killed a channel. */ spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsExiting) { + switch (server->tcpStatus) { + case CifsExiting: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; - goto out; - } - - /* - * Recheck after acquire mutex. If another thread is negotiating - * and the server never sends an answer the socket will be closed - * and tcpStatus set to reconnect. - */ - if (server->tcpStatus == CifsNeedReconnect) { + return -EHOSTDOWN; + case CifsNeedReconnect: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - - if (tcon->retry) - goto again; - - rc = -EHOSTDOWN; - goto out; + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + default: + break; } spin_unlock(&server->srv_lock); @@ -350,43 +341,41 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) { - /* - * if server stopped supporting multichannel - * and the first channel reconnected, disable all the others. - */ - if (ses->chan_count > 1 && - !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - rc = cifs_chan_skip_or_disable(ses, server, - from_reconnect); - if (rc) { - mutex_unlock(&ses->session_mutex); - goto out; - } - } - - rc = cifs_setup_session(0, ses, server, ses->local_nls); - if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { - /* - * Try alternate password for next reconnect (key rotation - * could be enabled on the server e.g.) 
if an alternate - * password is available and the current password is expired, - * but do not swap on non pwd related errors like host down - */ - if (ses->password2) - swap(ses->password2, ses->password); - } - - if ((rc == -EACCES) && !tcon->retry) { - mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; - goto failed; - } else if (rc) { + if (rc) { + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + } + /* + * if server stopped supporting multichannel + * and the first channel reconnected, disable all the others. + */ + if (ses->chan_count > 1 && + !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); goto out; } - } else { + } + + rc = cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if (rc) { mutex_unlock(&ses->session_mutex); + if (rc == -EACCES && !tcon->retry) + return -EHOSTDOWN; goto out; } @@ -490,7 +479,6 @@ out: case SMB2_IOCTL: rc = -EAGAIN; } -failed: return rc; } diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 6890016e1923..8554aa5a1059 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -113,11 +113,6 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, if (IS_ERR(d)) goto err_out; - if (d_is_negative(d)) { - dput(d); - goto err_out; - } - path->dentry = d; path->mnt = mntget(parent_path->mnt); @@ -211,8 +206,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { struct mnt_idmap *idmap; struct path path; - struct dentry *dentry; - int err; + struct dentry *dentry, *d; + int err = 0; dentry = ksmbd_vfs_kern_path_create(work, name, LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, @@ -227,27 +222,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (!err && d_unhashed(dentry)) { - struct dentry *d; - - d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - err = PTR_ERR(d); - goto out_err; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = -ENOENT; - goto out_err; - } - - ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); - dput(d); - } + d = dentry; + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + if (IS_ERR(dentry)) + err = PTR_ERR(dentry); + else if (d_is_negative(dentry)) + err = -ENOENT; + if (!err && dentry != d) + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); -out_err: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -693,6 +676,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; + int target_lookup_flags = LOOKUP_RENAME_TARGET; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -703,6 +687,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } + /* + * explicitly handle file overwrite 
case, for compatibility with + * filesystems that may not support rename flags (e.g: fuse) + */ + if (flags & RENAME_NOREPLACE) + target_lookup_flags |= LOOKUP_EXCL; + flags &= ~(RENAME_NOREPLACE); + retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -743,7 +735,7 @@ retry: } new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | LOOKUP_RENAME_TARGET); + lookup_flags | target_lookup_flags); if (IS_ERR(new_dentry)) { err = PTR_ERR(new_dentry); goto out3; @@ -754,16 +746,6 @@ retry: goto out4; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) { - err = -EEXIST; - goto out4; - } - flags &= ~(RENAME_NOREPLACE); - if (old_child == trap) { err = -EINVAL; goto out4; diff --git a/fs/splice.c b/fs/splice.c index 23fa5561b944..90d464241f15 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; - unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; @@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (!pipe_full(head, tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { @@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - pipe->bufs[head & mask] = *buf; + *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } @@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) @@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); @@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; - unsigned int head, tail, mask; + unsigned int head, tail; size_t left; int n; @@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; 
/* zero-length bvecs are not supported, skip them */ @@ -752,7 +747,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; @@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, pipe_lock(pipe); while (len > 0) { - unsigned int head, tail, mask, bc = 0; + unsigned int head, tail, bc = 0; size_t remain = len; /* @@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { @@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, len -= ret; tail = pipe->tail; while (ret > 0) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; @@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; @@ -1747,9 +1740,7 @@ retry: pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { size_t o_len; @@ -1792,8 +1783,8 @@ retry: goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* @@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; ssize_t ret = 0; /* @@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { @@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 4db0d2b0aab8..181260e72680 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -198,7 +198,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) { int i, j; - if (cache == NULL) + if (IS_ERR(cache) || cache == NULL) return; for (i = 0; i < cache->entries; i++) { diff --git a/fs/super.c b/fs/super.c index 5a7db4a556e3..97a17f9d9023 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); - invalidate_inodes(sb); + evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); @@ -1737,61 +1737,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int reconfigure_single(struct 
super_block *s, - int flags, void *data) -{ - struct fs_context *fc; - int ret; - - /* The caller really need to be passing fc down into mount_single(), - * then a chunk of this can be removed. [Bollocks -- AV] - * Better yet, reconfiguration shouldn't happen, but rather the second - * mount should be rejected if the parameters are not compatible. - */ - fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - ret = parse_monolithic_mount_data(fc, data); - if (ret < 0) - goto out; - - ret = reconfigure_super(fc); -out: - put_fs_context(fc); - return ret; -} - -static int compare_single(struct super_block *s, void *p) -{ - return 1; -} - -struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - - s = sget(fs_type, compare_single, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - if (!s->s_root) { - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (!error) - s->s_flags |= SB_ACTIVE; - } else { - error = reconfigure_single(s, flags, data); - } - if (unlikely(error)) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_single); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig deleted file mode 100644 index 67b3f90afbfd..000000000000 --- a/fs/sysv/Kconfig +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config SYSV_FS - tristate "System V/Xenix/V7/Coherent file system support" - depends on BLOCK - select BUFFER_HEAD - help - SCO, Xenix and Coherent are commercial Unix systems for Intel - machines, and Version 7 was used on the DEC PDP-11. Saying Y - here would allow you to read from their floppies and hard disk - partitions. - - If you have floppies or hard disk partitions like that, it is likely - that they contain binaries from those other Unix systems; in order - to run these binaries, you will want to install linux-abi which is - a set of kernel modules that lets you run SCO, Xenix, Wyse, - UnixWare, Dell Unix and System V programs under Linux. It is - available via FTP (user: ftp) from - <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>). - NOTE: that will work only for binaries from Intel-based systems; - PDP ones will have to wait until somebody ports Linux to -11 ;-) - - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the System V file system support - (but you need NFS file system support obviously). - - Note that this option is generally not needed for floppies, since a - good portable way to transport files and directories between unixes - (and even other operating systems) is given by the tar program ("man - tar" or preferably "info tar"). Note also that this option has - nothing whatsoever to do with the option "System V IPC". Read about - the System V file system in - <file:Documentation/filesystems/sysv-fs.rst>. - Saying Y here will enlarge your kernel by about 27 KB. - - To compile this as a module, choose M here: the module will be called - sysv. - - If you haven't heard about all of this before, it's safe to say N. 
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile deleted file mode 100644 index 17d12ba04b18..000000000000 --- a/fs/sysv/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for the Linux SystemV/Coherent filesystem routines. -# - -obj-$(CONFIG_SYSV_FS) += sysv.o - -sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ - namei.o super.o diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c deleted file mode 100644 index 0e69dbdf7277..000000000000 --- a/fs/sysv/balloc.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/balloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/balloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing blocks. - */ - -#include <linux/buffer_head.h> -#include <linux/string.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tfree = *sb->sv_free_blocks - but we nevertheless keep it up to date. */ - -static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) -{ - char *bh_data = bh->b_data; - - if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) - return (sysv_zone_t*)(bh_data+4); - else - return (sysv_zone_t*)(bh_data+2); -} - -/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ - -void sysv_free_block(struct super_block * sb, sysv_zone_t nr) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - sysv_zone_t *blocks = sbi->s_bcache; - unsigned count; - unsigned block = fs32_to_cpu(sbi, nr); - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only no one - * should call this for an AFS filesystem anyway... - */ - if (sbi->s_type == FSTYPE_AFS) - return; - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_free_block: trying to free block not in datazone\n"); - return; - } - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count > sbi->s_flc_size) { - printk("sysv_free_block: flc_count > flc_size\n"); - mutex_unlock(&sbi->s_lock); - return; - } - /* If the free list head in super-block is full, it is copied - * into this block being freed, ditto if it's completely empty - * (applies only on Coherent). 
- */ - if (count == sbi->s_flc_size || count == 0) { - block += sbi->s_block_base; - bh = sb_getblk(sb, block); - if (!bh) { - printk("sysv_free_block: getblk() failed\n"); - mutex_unlock(&sbi->s_lock); - return; - } - memset(bh->b_data, 0, sb->s_blocksize); - *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); - memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); - mark_buffer_dirty(bh); - set_buffer_uptodate(bh); - brelse(bh); - count = 0; - } - sbi->s_bcache[count++] = nr; - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - fs32_add(sbi, sbi->s_free_blocks, 1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); -} - -sysv_zone_t sysv_new_block(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int block; - sysv_zone_t nr; - struct buffer_head * bh; - unsigned count; - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count == 0) /* Applies only to Coherent FS */ - goto Enospc; - nr = sbi->s_bcache[--count]; - if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ - goto Enospc; - - block = fs32_to_cpu(sbi, nr); - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_new_block: new block %d is not in data zone\n", - block); - goto Enospc; - } - - if (count == 0) { /* the last block continues the free list */ - unsigned count; - - block += sbi->s_block_base; - if (!(bh = sb_bread(sb, block))) { - printk("sysv_new_block: cannot read free-list block\n"); - /* retry this same block next time */ - *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); - goto Enospc; - } - count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - if (count > sbi->s_flc_size) { - printk("sysv_new_block: free-list block with >flc_size entries\n"); - brelse(bh); - goto Enospc; - } - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - memcpy(sbi->s_bcache, get_chunk(sb, bh), - count * sizeof(sysv_zone_t)); - brelse(bh); - } - /* Now the free list head in the superblock is valid again. */ - fs32_add(sbi, sbi->s_free_blocks, -1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); - return nr; - -Enospc: - mutex_unlock(&sbi->s_lock); - return 0; -} - -unsigned long sysv_count_free_blocks(struct super_block * sb) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - int sb_count; - int count; - struct buffer_head * bh = NULL; - sysv_zone_t *blocks; - unsigned block; - int n; - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only we just - * lie and say it has no free block at all. - */ - if (sbi->s_type == FSTYPE_AFS) - return 0; - - mutex_lock(&sbi->s_lock); - sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... 
*/ - count = 0; - n = fs16_to_cpu(sbi, *sbi->s_bcache_count); - blocks = sbi->s_bcache; - while (1) { - sysv_zone_t zone; - if (n > sbi->s_flc_size) - goto E2big; - zone = 0; - while (n && (zone = blocks[--n]) != 0) - count++; - if (zone == 0) - break; - - block = fs32_to_cpu(sbi, zone); - if (bh) - brelse(bh); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) - goto Einval; - block += sbi->s_block_base; - bh = sb_bread(sb, block); - if (!bh) - goto Eio; - n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - blocks = get_chunk(sb, bh); - } - if (bh) - brelse(bh); - if (count != sb_count) - goto Ecount; -done: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_blocks: new block %d is not in data zone\n", - block); - goto trust_sb; -Eio: - printk("sysv_count_free_blocks: cannot read free-list block\n"); - goto trust_sb; -E2big: - printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); - if (bh) - brelse(bh); -trust_sb: - count = sb_count; - goto done; -Ecount: - printk("sysv_count_free_blocks: free block count was %d, " - "correcting to %d\n", sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_free_blocks = cpu_to_fs32(sbi, count); - dirty_sb(sb); - } - goto done; -} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c deleted file mode 100644 index 639307e2ff8c..000000000000 --- a/fs/sysv/dir.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/dir.c - * - * minix/dir.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/dir.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/dir.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent directory handling functions - */ - -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include "sysv.h" - -static int sysv_readdir(struct file *, struct dir_context *); - -const struct file_operations sysv_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate_shared = sysv_readdir, - .fsync = generic_file_fsync, -}; - -static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - struct address_space *mapping = folio->mapping; - struct inode *dir = mapping->host; - - block_write_end(NULL, mapping, pos, len, len, folio, NULL); - if (pos+len > dir->i_size) { - i_size_write(dir, pos+len); - mark_inode_dirty(dir); - } - folio_unlock(folio); -} - -static int sysv_handle_dirsync(struct inode *dir) -{ - int err; - - err = filemap_write_and_wait(dir->i_mapping); - if (!err) - err = sync_inode_metadata(dir, 1); - return err; -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() - * and must be treated accordingly for nesting purposes. 
- */ -static void *dir_get_folio(struct inode *dir, unsigned long n, - struct folio **foliop) -{ - struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); - - if (IS_ERR(folio)) - return ERR_CAST(folio); - *foliop = folio; - return kmap_local_folio(folio, 0); -} - -static int sysv_readdir(struct file *file, struct dir_context *ctx) -{ - unsigned long pos = ctx->pos; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - unsigned long npages = dir_pages(inode); - unsigned offset; - unsigned long n; - - ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); - if (pos >= inode->i_size) - return 0; - - offset = pos & ~PAGE_MASK; - n = pos >> PAGE_SHIFT; - - for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; - struct sysv_dir_entry *de; - struct folio *folio; - - kaddr = dir_get_folio(inode, n, &folio); - if (IS_ERR(kaddr)) - continue; - de = (struct sysv_dir_entry *)(kaddr+offset); - limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; - for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { - char *name = de->name; - - if (!de->inode) - continue; - - if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), - fs16_to_cpu(SYSV_SB(sb), de->inode), - DT_UNKNOWN)) { - folio_release_kmap(folio, kaddr); - return 0; - } - } - folio_release_kmap(folio, kaddr); - } - return 0; -} - -/* compare strings: name[0..len-1] (not zero-terminated) and - * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) - */ -static inline int namecompare(int len, int maxlen, - const char * name, const char * buffer) -{ - if (len < maxlen && buffer[len]) - return 0; - return !memcmp(name, buffer, len); -} - -/* - * sysv_find_entry() - * - * finds an entry in the specified directory with the wanted name. - * It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * On Success folio_release_kmap() should be called on *foliop. - * - * sysv_find_entry() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. 
- */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) -{ - const char * name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct inode * dir = d_inode(dentry->d_parent); - unsigned long start, n; - unsigned long npages = dir_pages(dir); - struct sysv_dir_entry *de; - - start = SYSV_I(dir)->i_dir_start_lookup; - if (start >= npages) - start = 0; - n = start; - - do { - char *kaddr = dir_get_folio(dir, n, foliop); - - if (!IS_ERR(kaddr)) { - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(*foliop) - SYSV_DIRSIZE; - for ( ; (char *) de <= kaddr ; de++) { - if (!de->inode) - continue; - if (namecompare(namelen, SYSV_NAMELEN, - name, de->name)) - goto found; - } - folio_release_kmap(*foliop, kaddr); - } - - if (++n >= npages) - n = 0; - } while (n != start); - - return NULL; - -found: - SYSV_I(dir)->i_dir_start_lookup = n; - return de; -} - -int sysv_add_link(struct dentry *dentry, struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char * name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct folio *folio = NULL; - struct sysv_dir_entry * de; - unsigned long npages = dir_pages(dir); - unsigned long n; - char *kaddr; - loff_t pos; - int err; - - /* We take care of directory expansion in the same loop */ - for (n = 0; n <= npages; n++) { - kaddr = dir_get_folio(dir, n, &folio); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); - de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE - SYSV_DIRSIZE; - while ((char *)de <= kaddr) { - if (!de->inode) - goto got_it; - err = -EEXIST; - if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) - goto out_folio; - de++; - } - folio_release_kmap(folio, kaddr); - } - BUG(); - return -EINVAL; - -got_it: - pos = folio_pos(folio) + offset_in_folio(folio, de); - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) - goto out_unlock; - memcpy (de->name, name, namelen); - memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - err = sysv_handle_dirsync(dir); -out_folio: - folio_release_kmap(folio, kaddr); - return err; -out_unlock: - folio_unlock(folio); - goto out_folio; -} - -int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) -{ - struct inode *inode = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = 0; - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - mark_inode_dirty(inode); - return sysv_handle_dirsync(inode); -} - -int sysv_make_empty(struct inode *inode, struct inode *dir) -{ - struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); - struct sysv_dir_entry * de; - char *kaddr; - int err; - - if (IS_ERR(folio)) - return PTR_ERR(folio); - err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - goto fail; - } - kaddr = kmap_local_folio(folio, 0); - memset(kaddr, 0, folio_size(folio)); - - de = (struct sysv_dir_entry *)kaddr; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - strcpy(de->name,"."); - de++; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); - strcpy(de->name,".."); - - 
kunmap_local(kaddr); - dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); - err = sysv_handle_dirsync(inode); -fail: - folio_put(folio); - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -int sysv_empty_dir(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct folio *folio = NULL; - unsigned long i, npages = dir_pages(inode); - char *kaddr; - - for (i = 0; i < npages; i++) { - struct sysv_dir_entry *de; - - kaddr = dir_get_folio(inode, i, &folio); - if (IS_ERR(kaddr)) - continue; - - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(folio) - SYSV_DIRSIZE; - - for ( ;(char *)de <= kaddr; de++) { - if (!de->inode) - continue; - /* check for . and .. */ - if (de->name[0] != '.') - goto not_empty; - if (!de->name[1]) { - if (de->inode == cpu_to_fs16(SYSV_SB(sb), - inode->i_ino)) - continue; - goto not_empty; - } - if (de->name[1] != '.' || de->name[2]) - goto not_empty; - } - folio_release_kmap(folio, kaddr); - } - return 1; - -not_empty: - folio_release_kmap(folio, kaddr); - return 0; -} - -/* Releases the page */ -int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, - struct inode *inode) -{ - struct inode *dir = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - return sysv_handle_dirsync(inode); -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * sysv_dotdot() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. - */ -struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) -{ - struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); - - if (IS_ERR(de)) - return NULL; - /* ".." is the second directory entry */ - return de + 1; -} - -ino_t sysv_inode_by_name(struct dentry *dentry) -{ - struct folio *folio; - struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); - ino_t res = 0; - - if (de) { - res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - folio_release_kmap(folio, de); - } - return res; -} diff --git a/fs/sysv/file.c b/fs/sysv/file.c deleted file mode 100644 index c645f60bdb7f..000000000000 --- a/fs/sysv/file.c +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/file.c - * - * minix/file.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/file.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/file.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent regular file handling primitives - */ - -#include "sysv.h" - -/* - * We have mostly NULLs here: the current defaults are OK for - * the coh filesystem. 
- */ -const struct file_operations sysv_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .fsync = generic_file_fsync, - .splice_read = filemap_splice_read, -}; - -static int sysv_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int error; - - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (error) - return error; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - truncate_setsize(inode, attr->ia_size); - sysv_truncate(inode); - } - - setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations sysv_file_inode_operations = { - .setattr = sysv_setattr, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c deleted file mode 100644 index 269df6d49815..000000000000 --- a/fs/sysv/ialloc.c +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/ialloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/ialloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing inodes. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes - but we nevertheless keep it up to date. */ - -/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. 
*/ - -/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ -static inline sysv_ino_t * -sv_sb_fic_inode(struct super_block * sb, unsigned int i) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (sbi->s_bh1 == sbi->s_bh2) - return &sbi->s_sb_fic_inodes[i]; - else { - /* 512 byte Xenix FS */ - unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); - if (offset < 512) - return (sysv_ino_t*)(sbi->s_sbd1 + offset); - else - return (sysv_ino_t*)(sbi->s_sbd2 + offset); - } -} - -struct sysv_inode * -sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct sysv_inode *res; - int block = sbi->s_firstinodezone + sbi->s_block_base; - - block += (ino-1) >> sbi->s_inodes_per_block_bits; - *bh = sb_bread(sb, block); - if (!*bh) - return NULL; - res = (struct sysv_inode *)(*bh)->b_data; - return res + ((ino-1) & sbi->s_inodes_per_block_1); -} - -static int refill_free_cache(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - int i = 0, ino; - - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { - *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); - if (i == sbi->s_fic_size) - break; - } - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - } else - raw_inode++; - } - brelse(bh); -out: - return i; -} - -void sysv_free_inode(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int ino; - struct buffer_head * bh; - struct sysv_inode * raw_inode; - unsigned count; - - sb = inode->i_sb; - ino = inode->i_ino; - if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { - printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); - return; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("sysv_free_inode: unable to read inode block on device " - "%s\n", inode->i_sb->s_id); - return; - } - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count < sbi->s_fic_size) { - *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - } - fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); - dirty_sb(sb); - memset(raw_inode, 0, sizeof(struct sysv_inode)); - mark_buffer_dirty(bh); - mutex_unlock(&sbi->s_lock); - brelse(bh); -} - -struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *inode; - sysv_ino_t ino; - unsigned count; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE - }; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { - count = refill_free_cache(sb); - if (count == 0) { - iput(inode); - mutex_unlock(&sbi->s_lock); - return ERR_PTR(-ENOSPC); - } - } - /* Now count > 0. 
*/ - ino = *sv_sb_fic_inode(sb,--count); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); - dirty_sb(sb); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_ino = fs16_to_cpu(sbi, ino); - simple_inode_init_ts(inode); - inode->i_blocks = 0; - memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); - SYSV_I(inode)->i_dir_start_lookup = 0; - insert_inode_hash(inode); - mark_inode_dirty(inode); - - sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ - mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ - /* That's it. */ - mutex_unlock(&sbi->s_lock); - return inode; -} - -unsigned long sysv_count_free_inodes(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - int ino, count, sb_count; - - mutex_lock(&sbi->s_lock); - - sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... */ - count = 0; - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) - count++; - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - } else - raw_inode++; - } - brelse(bh); - if (count != sb_count) - goto Einval; -out: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_inodes: " - "free inode count was %d, correcting to %d\n", - sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); - dirty_sb(sb); - } - goto out; - -Eio: - printk("sysv_count_free_inodes: unable to read inode table\n"); -trust_sb: - count = sb_count; - goto out; -} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c deleted file mode 100644 index 76bc2d5e75a9..000000000000 --- a/fs/sysv/inode.c +++ /dev/null @@ -1,354 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for allocating/freeing inodes and for read/writing - * the superblock. - */ - -#include <linux/highuid.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> -#include <linux/writeback.h> -#include <linux/namei.h> -#include <asm/byteorder.h> -#include "sysv.h" - -static int sysv_sync_fs(struct super_block *sb, int wait) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - u32 time = (u32)ktime_get_real_seconds(), old_time; - - mutex_lock(&sbi->s_lock); - - /* - * If we are going to write out the super block, - * then attach current time stamp. - * But if the filesystem was marked clean, keep it clean. 
- */ - old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); - if (sbi->s_type == FSTYPE_SYSV4) { - if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time)) - *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time); - *sbi->s_sb_time = cpu_to_fs32(sbi, time); - mark_buffer_dirty(sbi->s_bh2); - } - - mutex_unlock(&sbi->s_lock); - - return 0; -} - -static int sysv_remount(struct super_block *sb, int *flags, char *data) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - sync_filesystem(sb); - if (sbi->s_forced_ro) - *flags |= SB_RDONLY; - return 0; -} - -static void sysv_put_super(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (!sb_rdonly(sb)) { - /* XXX ext2 also updates the state here */ - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); - } - - brelse(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - brelse(sbi->s_bh2); - - kfree(sbi); -} - -static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - - buf->f_type = sb->s_magic; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = sbi->s_ndatazones; - buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); - buf->f_files = sbi->s_ninodes; - buf->f_ffree = sysv_count_free_inodes(sb); - buf->f_namelen = SYSV_NAMELEN; - buf->f_fsid = u64_to_fsid(id); - return 0; -} - -/* - * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 - */ -static inline void read3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = 0; - to[2] = from[1]; - to[3] = from[2]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - to[3] = 0; - } else { - to[0] = 0; - to[1] = from[0]; - to[2] = from[1]; - to[3] = from[2]; - } -} - -static inline void write3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = from[2]; - to[2] = from[3]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - } else { - to[0] = from[1]; - to[1] = from[2]; - to[2] = from[3]; - } -} - -static const struct inode_operations sysv_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = sysv_getattr, -}; - -void sysv_set_inode(struct inode *inode, dev_t rdev) -{ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &sysv_file_inode_operations; - inode->i_fop = &sysv_file_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &sysv_dir_inode_operations; - inode->i_fop = &sysv_dir_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &sysv_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &sysv_aops; - } else - init_special_inode(inode, inode->i_mode, rdev); -} - -struct inode *sysv_iget(struct super_block *sb, unsigned int ino) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - struct inode *inode; - unsigned int block; - - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of range\n", - sb->s_id, ino); - return ERR_PTR(-EIO); - } - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & 
I_NEW)) - return inode; - - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("Major problem: unable to read inode from dev %s\n", - inode->i_sb->s_id); - goto bad_inode; - } - /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ - inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); - i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid)); - i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); - set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); - inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); - inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0); - inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0); - inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); - inode->i_blocks = 0; - - si = SYSV_I(inode); - for (block = 0; block < 10+1+1+1; block++) - read3byte(sbi, &raw_inode->i_data[3*block], - (u8 *)&si->i_data[block]); - brelse(bh); - si->i_dir_start_lookup = 0; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - sysv_set_inode(inode, - old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); - else - sysv_set_inode(inode, 0); - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(-EIO); -} - -static int __sysv_write_inode(struct inode *inode, int wait) -{ - struct super_block * sb = inode->i_sb; - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - unsigned int ino, block; - int err = 0; - - ino = inode->i_ino; - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of range\n", - inode->i_sb->s_id, ino); - return -EIO; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("unable to read i-node block\n"); - return -EIO; - } - - raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); - raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode))); - raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); - raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); - raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); - raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode)); - raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode)); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode)); - - si = SYSV_I(inode); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); - for (block = 0; block < 10+1+1+1; block++) - write3byte(sbi, (u8 *)&si->i_data[block], - &raw_inode->i_data[3*block]); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08x]\n", - sb->s_id, ino); - err = -EIO; - } - } - brelse(bh); - return err; -} - -int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); -} - -int sysv_sync_inode(struct inode *inode) -{ - return __sysv_write_inode(inode, 1); -} - -static void sysv_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - if (!inode->i_nlink) { - inode->i_size = 0; - sysv_truncate(inode); - } - invalidate_inode_buffers(inode); - clear_inode(inode); - if (!inode->i_nlink) - sysv_free_inode(inode); -} - -static struct kmem_cache *sysv_inode_cachep; - -static struct inode *sysv_alloc_inode(struct super_block *sb) -{ - struct sysv_inode_info *si; - - si = alloc_inode_sb(sb, 
sysv_inode_cachep, GFP_KERNEL); - if (!si) - return NULL; - return &si->vfs_inode; -} - -static void sysv_free_in_core_inode(struct inode *inode) -{ - kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); -} - -static void init_once(void *p) -{ - struct sysv_inode_info *si = (struct sysv_inode_info *)p; - - inode_init_once(&si->vfs_inode); -} - -const struct super_operations sysv_sops = { - .alloc_inode = sysv_alloc_inode, - .free_inode = sysv_free_in_core_inode, - .write_inode = sysv_write_inode, - .evict_inode = sysv_evict_inode, - .put_super = sysv_put_super, - .sync_fs = sysv_sync_fs, - .remount_fs = sysv_remount, - .statfs = sysv_statfs, -}; - -int __init sysv_init_icache(void) -{ - sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", - sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, - init_once); - if (!sysv_inode_cachep) - return -ENOMEM; - return 0; -} - -void sysv_destroy_icache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(sysv_inode_cachep); -} diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c deleted file mode 100644 index 451e95f474fa..000000000000 --- a/fs/sysv/itree.c +++ /dev/null @@ -1,511 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/itree.c - * - * Handling of indirect blocks' trees. - * AV, Sep--Dec 2000 - */ - -#include <linux/buffer_head.h> -#include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/string.h> -#include "sysv.h" - -enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ - -static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) -{ - mark_buffer_dirty_inode(bh, inode); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); -} - -static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned long indirect_blocks = sbi->s_ind_per_block, - double_blocks = sbi->s_ind_per_block_2; - int n = 0; - - if (block < 0) { - printk("sysv_block_map: block < 0\n"); - } else if (block < DIRECT) { - offsets[n++] = block; - } else if ( (block -= DIRECT) < indirect_blocks) { - offsets[n++] = DIRECT; - offsets[n++] = block; - } else if ((block -= indirect_blocks) < double_blocks) { - offsets[n++] = DIRECT+1; - offsets[n++] = block >> ptrs_bits; - offsets[n++] = block & (indirect_blocks - 1); - } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { - offsets[n++] = DIRECT+2; - offsets[n++] = block >> (ptrs_bits * 2); - offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); - offsets[n++] = block & (indirect_blocks - 1); - } else { - /* nothing */; - } - return n; -} - -static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) -{ - return sbi->s_block_base + fs32_to_cpu(sbi, nr); -} - -typedef struct { - sysv_zone_t *p; - sysv_zone_t key; - struct buffer_head *bh; -} Indirect; - -static DEFINE_RWLOCK(pointers_lock); - -static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -static inline int verify_chain(Indirect *from, Indirect *to) -{ - while (from <= to && from->key == *from->p) - from++; - return (from > to); -} - -static inline sysv_zone_t *block_end(struct buffer_head *bh) -{ - return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); -} - -static Indirect *get_branch(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - int 
*err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - int block = block_to_cpu(SYSV_SB(sb), p->key); - bh = sb_bread(sb, block); - if (!bh) - goto failure; - read_lock(&pointers_lock); - if (!verify_chain(chain, p)) - goto changed; - add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); - read_unlock(&pointers_lock); - if (!p->key) - goto no_block; - } - return NULL; - -changed: - read_unlock(&pointers_lock); - brelse(bh); - *err = -EAGAIN; - goto no_block; -failure: - *err = -EIO; -no_block: - return p; -} - -static int alloc_branch(struct inode *inode, - int num, - int *offsets, - Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int n = 0; - int i; - - branch[0].key = sysv_new_block(inode->i_sb); - if (branch[0].key) for (n = 1; n < num; n++) { - struct buffer_head *bh; - int parent; - /* Allocate the next block */ - branch[n].key = sysv_new_block(inode->i_sb); - if (!branch[n].key) - break; - /* - * Get buffer_head for parent block, zero it out and set - * the pointer to new one, then send parent to disk. - */ - parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); - bh = sb_getblk(inode->i_sb, parent); - if (!bh) { - sysv_free_block(inode->i_sb, branch[n].key); - break; - } - lock_buffer(bh); - memset(bh->b_data, 0, blocksize); - branch[n].bh = bh; - branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; - *branch[n].p = branch[n].key; - set_buffer_uptodate(bh); - unlock_buffer(bh); - dirty_indirect(bh, inode); - } - if (n == num) - return 0; - - /* Allocation failed, free what we already allocated */ - for (i = 1; i < n; i++) - bforget(branch[i].bh); - for (i = 0; i < n; i++) - sysv_free_block(inode->i_sb, branch[i].key); - return -ENOSPC; -} - -static inline int splice_branch(struct inode *inode, - Indirect chain[], - Indirect *where, - int num) -{ - int i; - - /* Verify that place we are splicing to is still there and vacant */ - write_lock(&pointers_lock); - if (!verify_chain(chain, where-1) || *where->p) - goto changed; - *where->p = where->key; - write_unlock(&pointers_lock); - - inode_set_ctime_current(inode); - - /* had we spliced it onto indirect block? 
*/ - if (where->bh) - dirty_indirect(where->bh, inode); - - if (IS_SYNC(inode)) - sysv_sync_inode(inode); - else - mark_inode_dirty(inode); - return 0; - -changed: - write_unlock(&pointers_lock); - for (i = 1; i < num; i++) - bforget(where[i].bh); - for (i = 0; i < num; i++) - sysv_free_block(inode->i_sb, where[i].key); - return -EAGAIN; -} - -static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) -{ - int err = -EIO; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - struct super_block *sb = inode->i_sb; - Indirect *partial; - int left; - int depth = block_to_path(inode, iblock, offsets); - - if (depth == 0) - goto out; - -reread: - partial = get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { -got_it: - map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), - chain[depth-1].key)); - /* Clean up and exit */ - partial = chain+depth-1; /* the whole chain */ - goto cleanup; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { -cleanup: - while (partial > chain) { - brelse(partial->bh); - partial--; - } -out: - return err; - } - - /* - * Indirect block might be removed by truncate while we were - * reading it. Handling of that case (forget what we've got and - * reread) is taken out of the main path. - */ - if (err == -EAGAIN) - goto changed; - - left = (chain + depth) - partial; - err = alloc_branch(inode, left, offsets+(partial-chain), partial); - if (err) - goto cleanup; - - if (splice_branch(inode, chain, partial, left) < 0) - goto changed; - - set_buffer_new(bh_result); - goto got_it; - -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; -} - -static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -static Indirect *find_shared(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - sysv_zone_t *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = get_branch(inode, k, offsets, chain, &err); - - write_lock(&pointers_lock); - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) { - write_unlock(&pointers_lock); - goto no_top; - } - for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. 
- */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - *p->p = 0; - } - write_unlock(&pointers_lock); - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) -{ - for ( ; p < q ; p++) { - sysv_zone_t nr = *p; - if (nr) { - *p = 0; - sysv_free_block(inode->i_sb, nr); - mark_inode_dirty(inode); - } - } -} - -static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) -{ - struct buffer_head * bh; - struct super_block *sb = inode->i_sb; - - if (depth--) { - for ( ; p < q ; p++) { - int block; - sysv_zone_t nr = *p; - if (!nr) - continue; - *p = 0; - block = block_to_cpu(SYSV_SB(sb), nr); - bh = sb_bread(sb, block); - if (!bh) - continue; - free_branches(inode, (sysv_zone_t*)bh->b_data, - block_end(bh), depth); - bforget(bh); - sysv_free_block(sb, nr); - mark_inode_dirty(inode); - } - } else - free_data(inode, p, q); -} - -void sysv_truncate (struct inode * inode) -{ - sysv_zone_t *i_data = SYSV_I(inode)->i_data; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - Indirect *partial; - sysv_zone_t nr = 0; - int n; - long iblock; - unsigned blocksize; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - - blocksize = inode->i_sb->s_blocksize; - iblock = (inode->i_size + blocksize-1) - >> inode->i_sb->s_blocksize_bits; - - block_truncate_page(inode->i_mapping, inode->i_size, get_block); - - n = block_to_path(inode, iblock, offsets); - if (n == 0) - return; - - if (n == 1) { - free_data(inode, i_data+offsets[0], i_data + DIRECT); - goto do_indirects; - } - - partial = find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (already detached) */ - if (nr) { - if (partial == chain) - mark_inode_dirty(inode); - else - dirty_indirect(partial->bh, inode); - free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - free_branches(inode, partial->p + 1, block_end(partial->bh), - (chain+n-1) - partial); - dirty_indirect(partial->bh, inode); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees (== subtrees deeper than...) 
*/ - while (n < DEPTH) { - nr = i_data[DIRECT + n - 1]; - if (nr) { - i_data[DIRECT + n - 1] = 0; - mark_inode_dirty(inode); - free_branches(inode, &nr, &nr+1, n); - } - n++; - } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - if (IS_SYNC(inode)) - sysv_sync_inode (inode); - else - mark_inode_dirty(inode); -} - -static unsigned sysv_nblocks(struct super_block *s, loff_t size) -{ - struct sysv_sb_info *sbi = SYSV_SB(s); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned blocks, res, direct = DIRECT, i = DEPTH; - blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; - res = blocks; - while (--i && blocks > direct) { - blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; - res += blocks; - direct = 1; - } - return res; -} - -int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, - struct kstat *stat, u32 request_mask, unsigned int flags) -{ - struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), - stat); - stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); - stat->blksize = s->s_blocksize; - return 0; -} - -static int sysv_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return mpage_writepages(mapping, wbc, get_block); -} - -static int sysv_read_folio(struct file *file, struct folio *folio) -{ - return block_read_full_folio(folio, get_block); -} - -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - return __block_write_begin(folio, pos, len, get_block); -} - -static void sysv_write_failed(struct address_space *mapping, loff_t to) -{ - struct inode *inode = mapping->host; - - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - sysv_truncate(inode); - } -} - -static int sysv_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - int ret; - - ret = block_write_begin(mapping, pos, len, foliop, get_block); - if (unlikely(ret)) - sysv_write_failed(mapping, pos + len); - - return ret; -} - -static sector_t sysv_bmap(struct address_space *mapping, sector_t block) -{ - return generic_block_bmap(mapping,block,get_block); -} - -const struct address_space_operations sysv_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = sysv_read_folio, - .writepages = sysv_writepages, - .write_begin = sysv_write_begin, - .write_end = generic_write_end, - .migrate_folio = buffer_migrate_folio, - .bmap = sysv_bmap -}; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c deleted file mode 100644 index fb8bd8437872..000000000000 --- a/fs/sysv/namei.c +++ /dev/null @@ -1,280 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/namei.c - * - * minix/namei.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/namei.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/namei.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. 
Baranowski - */ - -#include <linux/pagemap.h> -#include "sysv.h" - -static int add_nondir(struct dentry *dentry, struct inode *inode) -{ - int err = sysv_add_link(dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } - inode_dec_link_count(inode); - iput(inode); - return err; -} - -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) -{ - struct inode * inode = NULL; - ino_t ino; - - if (dentry->d_name.len > SYSV_NAMELEN) - return ERR_PTR(-ENAMETOOLONG); - ino = sysv_inode_by_name(dentry); - if (ino) - inode = sysv_iget(dir->i_sb, ino); - return d_splice_alias(inode, dentry); -} - -static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) -{ - struct inode * inode; - int err; - - if (!old_valid_dev(rdev)) - return -EINVAL; - - inode = sysv_new_inode(dir, mode); - err = PTR_ERR(inode); - - if (!IS_ERR(inode)) { - sysv_set_inode(inode, rdev); - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); - } - return err; -} - -static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) -{ - return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); -} - -static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, const char *symname) -{ - int err = -ENAMETOOLONG; - int l = strlen(symname)+1; - struct inode * inode; - - if (l > dir->i_sb->s_blocksize) - goto out; - - inode = sysv_new_inode(dir, S_IFLNK|0777); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - sysv_set_inode(inode, 0); - err = page_symlink(inode, symname, l); - if (err) - goto out_fail; - - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; -} - -static int sysv_link(struct dentry * old_dentry, struct inode * dir, - struct dentry * dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode_set_ctime_current(inode); - inode_inc_link_count(inode); - ihold(inode); - - return add_nondir(dentry, inode); -} - -static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - struct inode * inode; - int err; - - inode_inc_link_count(dir); - - inode = sysv_new_inode(dir, S_IFDIR|mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_dir; - - sysv_set_inode(inode, 0); - - inode_inc_link_count(inode); - - err = sysv_make_empty(inode, dir); - if (err) - goto out_fail; - - err = sysv_add_link(dentry, inode); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); - iput(inode); -out_dir: - inode_dec_link_count(dir); - goto out; -} - -static int sysv_unlink(struct inode * dir, struct dentry * dentry) -{ - struct inode * inode = d_inode(dentry); - struct folio *folio; - struct sysv_dir_entry * de; - int err; - - de = sysv_find_entry(dentry, &folio); - if (!de) - return -ENOENT; - - err = sysv_delete_entry(de, folio); - if (!err) { - inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); - inode_dec_link_count(inode); - } - folio_release_kmap(folio, de); - return err; -} - -static int sysv_rmdir(struct inode * dir, struct dentry * dentry) -{ - struct inode *inode = d_inode(dentry); - int err = -ENOTEMPTY; - - if (sysv_empty_dir(inode)) { - err = sysv_unlink(dir, dentry); - if (!err) { - inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); - } 
- } - return err; -} - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. - */ -static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) -{ - struct inode * old_inode = d_inode(old_dentry); - struct inode * new_inode = d_inode(new_dentry); - struct folio *dir_folio; - struct sysv_dir_entry * dir_de = NULL; - struct folio *old_folio; - struct sysv_dir_entry * old_de; - int err = -ENOENT; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - old_de = sysv_find_entry(old_dentry, &old_folio); - if (!old_de) - goto out; - - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - dir_de = sysv_dotdot(old_inode, &dir_folio); - if (!dir_de) - goto out_old; - } - - if (new_inode) { - struct folio *new_folio; - struct sysv_dir_entry * new_de; - - err = -ENOTEMPTY; - if (dir_de && !sysv_empty_dir(new_inode)) - goto out_dir; - - err = -ENOENT; - new_de = sysv_find_entry(new_dentry, &new_folio); - if (!new_de) - goto out_dir; - err = sysv_set_link(new_de, new_folio, old_inode); - folio_release_kmap(new_folio, new_de); - if (err) - goto out_dir; - inode_set_ctime_current(new_inode); - if (dir_de) - drop_nlink(new_inode); - inode_dec_link_count(new_inode); - } else { - err = sysv_add_link(new_dentry, old_inode); - if (err) - goto out_dir; - if (dir_de) - inode_inc_link_count(new_dir); - } - - err = sysv_delete_entry(old_de, old_folio); - if (err) - goto out_dir; - - mark_inode_dirty(old_inode); - - if (dir_de) { - err = sysv_set_link(dir_de, dir_folio, new_dir); - if (!err) - inode_dec_link_count(old_dir); - } - -out_dir: - if (dir_de) - folio_release_kmap(dir_folio, dir_de); -out_old: - folio_release_kmap(old_folio, old_de); -out: - return err; -} - -/* - * directories can handle most operations... - */ -const struct inode_operations sysv_dir_inode_operations = { - .create = sysv_create, - .lookup = sysv_lookup, - .link = sysv_link, - .unlink = sysv_unlink, - .symlink = sysv_symlink, - .mkdir = sysv_mkdir, - .rmdir = sysv_rmdir, - .mknod = sysv_mknod, - .rename = sysv_rename, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/super.c b/fs/sysv/super.c deleted file mode 100644 index 5c0d07ddbda2..000000000000 --- a/fs/sysv/super.c +++ /dev/null @@ -1,595 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for read/parsing the superblock. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include "sysv.h" - -/* - * The following functions try to recognize specific filesystems. - * - * We recognize: - * - Xenix FS by its magic number. - * - SystemV FS by its magic number. - * - Coherent FS by its funny fname/fpack field. - * - SCO AFS by s_nfree == 0xffff - * - V7 FS has no distinguishing features. - * - * We discriminate among SystemV4 and SystemV2 FS by the assumption that - * the time stamp is not < 01-01-1980. 
- */ - -enum { - JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 -}; - -static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - struct xenix_super_block * sbd1; - struct xenix_super_block * sbd2; - - if (bh1 != bh2) - sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; - else { - /* block size = 512, so bh1 != bh2 */ - sbd1 = (struct xenix_super_block *) bh1->b_data; - sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); - } - - *max_links = XENIX_LINK_MAX; - sbi->s_fic_size = XENIX_NICINOD; - sbi->s_flc_size = XENIX_NICFREE; - sbi->s_sbd1 = (char *)sbd1; - sbi->s_sbd2 = (char *)sbd2; - sbi->s_sb_fic_count = &sbd1->s_ninode; - sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd2->s_tinode; - sbi->s_bcache_count = &sbd1->s_nfree; - sbi->s_bcache = &sbd1->s_free[0]; - sbi->s_free_blocks = &sbd2->s_tfree; - sbi->s_sb_time = &sbd2->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); -} - -static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv4_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv4_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv2_super_block *sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv2_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct coh_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - - sbd = (struct coh_super_block *) bh1->b_data; - - *max_links = COH_LINK_MAX; - sbi->s_fic_size = COH_NICINOD; - sbi->s_flc_size = COH_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = 
&sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh2 = sbi->s_bh2; - struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; - - *max_links = V7_LINK_MAX; - sbi->s_fic_size = V7_NICINOD; - sbi->s_flc_size = V7_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - switch (fs32_to_cpu(sbi, sbd->s_type)) { - case 1: - sbi->s_type = FSTYPE_XENIX; - return 1; - case 2: - sbi->s_type = FSTYPE_XENIX; - return 2; - default: - return 0; - } -} - -static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct super_block *sb = sbi->s_sb; - /* All relevant fields are at the same offsets in R2 and R4 */ - struct sysv4_super_block * sbd; - u32 type; - - sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - - type = fs32_to_cpu(sbi, sbd->s_type); - - if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { - sbi->s_type = FSTYPE_AFS; - sbi->s_forced_ro = 1; - if (!sb_rdonly(sb)) { - printk("SysV FS: SCO EAFS on %s detected, " - "forcing read-only mode.\n", - sb->s_id); - } - return type; - } - - if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { - /* this is likely to happen on SystemV2 FS */ - if (type > 3 || type < 1) - return 0; - sbi->s_type = FSTYPE_SYSV2; - return type; - } - if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) - return 0; - - /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, - 0x20 or 0x30 indicates that symbolic links and the 14-character - filename limit is gone. Due to lack of information about this - feature read-only mode seems to be a reasonable approach... -KGB */ - - if (type >= 0x10) { - printk("SysV FS: can't handle long file names on %s, " - "forcing read-only mode.\n", sb->s_id); - sbi->s_forced_ro = 1; - } - - sbi->s_type = FSTYPE_SYSV4; - return type >= 0x10 ? type >> 4 : type; -} - -static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct coh_super_block * sbd; - - sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); - if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) - || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) - return 0; - sbi->s_bytesex = BYTESEX_PDP; - sbi->s_type = FSTYPE_COH; - return 1; -} - -static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - int size = detect_sysv(sbi, bh); - - return size>2 ? 
0 : size; -} - -static struct { - int block; - int (*test)(struct sysv_sb_info *, struct buffer_head *); -} flavours[] = { - {1, detect_xenix}, - {0, detect_sysv}, - {0, detect_coherent}, - {9, detect_sysv_odd}, - {15,detect_sysv_odd}, - {18,detect_sysv}, -}; - -static char *flavour_names[] = { - [FSTYPE_XENIX] = "Xenix", - [FSTYPE_SYSV4] = "SystemV", - [FSTYPE_SYSV2] = "SystemV Release 2", - [FSTYPE_COH] = "Coherent", - [FSTYPE_V7] = "V7", - [FSTYPE_AFS] = "AFS", -}; - -static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { - [FSTYPE_XENIX] = detected_xenix, - [FSTYPE_SYSV4] = detected_sysv4, - [FSTYPE_SYSV2] = detected_sysv2, - [FSTYPE_COH] = detected_coherent, - [FSTYPE_V7] = detected_v7, - [FSTYPE_AFS] = detected_sysv4, -}; - -static int complete_read_super(struct super_block *sb, int silent, int size) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *root_inode; - char *found = flavour_names[sbi->s_type]; - u_char n_bits = size+8; - int bsize = 1 << n_bits; - int bsize_4 = bsize >> 2; - - sbi->s_firstinodezone = 2; - - flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - if (sbi->s_firstdatazone < sbi->s_firstinodezone) - return 0; - - sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; - sbi->s_inodes_per_block = bsize >> 6; - sbi->s_inodes_per_block_1 = (bsize >> 6)-1; - sbi->s_inodes_per_block_bits = n_bits-6; - sbi->s_ind_per_block = bsize_4; - sbi->s_ind_per_block_2 = bsize_4*bsize_4; - sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); - sbi->s_ind_per_block_bits = n_bits-2; - - sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) - << sbi->s_inodes_per_block_bits; - - if (!silent) - printk("VFS: Found a %s FS (block size = %ld) on device %s\n", - found, sb->s_blocksize, sb->s_id); - - sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; - /* set up enough so that it can read an inode */ - sb->s_op = &sysv_sops; - if (sbi->s_forced_ro) - sb->s_flags |= SB_RDONLY; - root_inode = sysv_iget(sb, SYSV_ROOT_INO); - if (IS_ERR(root_inode)) { - printk("SysV FS: get root inode failed\n"); - return 0; - } - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) { - printk("SysV FS: get root dentry failed\n"); - return 0; - } - return 1; -} - -static int sysv_fill_super(struct super_block *sb, void *data, int silent) -{ - struct buffer_head *bh1, *bh = NULL; - struct sysv_sb_info *sbi; - unsigned long blocknr; - int size = 0, i; - - BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); - BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); - BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - sb_set_blocksize(sb, BLOCK_SIZE); - - for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { - brelse(bh); - bh = sb_bread(sb, flavours[i].block); - if (!bh) - continue; - size = flavours[i].test(SYSV_SB(sb), bh); - } - - if (!size) - goto Eunknown; - - switch (size) { - case 1: - blocknr = bh->b_blocknr << 1; - brelse(bh); - sb_set_blocksize(sb, 512); - bh1 = sb_bread(sb, blocknr); - bh = sb_bread(sb, blocknr + 1); - break; - case 2: - bh1 = bh; - break; - case 3: - blocknr = bh->b_blocknr >> 1; - brelse(bh); - sb_set_blocksize(sb, 2048); - bh1 = bh = sb_bread(sb, blocknr); - break; - default: - goto 
Ebadsize; - } - - if (bh && bh1) { - sbi->s_bh1 = bh1; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, size)) - return 0; - } - - brelse(bh1); - brelse(bh); - sb_set_blocksize(sb, BLOCK_SIZE); - printk("oldfs: cannot read superblock\n"); -failed: - kfree(sbi); - return -EINVAL; - -Eunknown: - brelse(bh); - if (!silent) - printk("VFS: unable to find oldfs superblock on device %s\n", - sb->s_id); - goto failed; -Ebadsize: - brelse(bh); - if (!silent) - printk("VFS: oldfs: unsupported block size (%dKb)\n", - 1<<(size-2)); - goto failed; -} - -static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) -{ - struct v7_super_block *v7sb; - struct sysv_inode *v7i; - struct buffer_head *bh2; - struct sysv_sb_info *sbi; - - sbi = sb->s_fs_info; - - /* plausibility check on superblock */ - v7sb = (struct v7_super_block *) bh->b_data; - if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || - fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || - fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) - return 0; - - /* plausibility check on root inode: it is a directory, - with a nonzero size that is a multiple of 16 */ - bh2 = sb_bread(sb, 2); - if (bh2 == NULL) - return 0; - - v7i = (struct sysv_inode *)(bh2->b_data + 64); - if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || - (fs32_to_cpu(sbi, v7i->i_size) == 0) || - (fs32_to_cpu(sbi, v7i->i_size) & 017) || - (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * - sizeof(struct sysv_dir_entry))) { - brelse(bh2); - return 0; - } - - brelse(bh2); - return 1; -} - -static int v7_fill_super(struct super_block *sb, void *data, int silent) -{ - struct sysv_sb_info *sbi; - struct buffer_head *bh; - - BUILD_BUG_ON(sizeof(struct v7_super_block) != 440); - BUILD_BUG_ON(sizeof(struct sysv_inode) != 64); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - sbi->s_type = FSTYPE_V7; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - - sb_set_blocksize(sb, 512); - - if ((bh = sb_bread(sb, 1)) == NULL) { - if (!silent) - printk("VFS: unable to read V7 FS superblock on " - "device %s.\n", sb->s_id); - goto failed; - } - - /* Try PDP-11 UNIX */ - sbi->s_bytesex = BYTESEX_PDP; - if (v7_sanity_check(sb, bh)) - goto detected; - - /* Try PC/IX, v7/x86 */ - sbi->s_bytesex = BYTESEX_LE; - if (v7_sanity_check(sb, bh)) - goto detected; - - goto failed; - -detected: - sbi->s_bh1 = bh; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, 1)) - return 0; - -failed: - printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", - sb->s_id); - brelse(bh); - kfree(sbi); - return -EINVAL; -} - -/* Every kernel module contains stuff like this. 
*/ - -static struct dentry *sysv_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); -} - -static struct dentry *v7_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); -} - -static struct file_system_type sysv_fs_type = { - .owner = THIS_MODULE, - .name = "sysv", - .mount = sysv_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("sysv"); - -static struct file_system_type v7_fs_type = { - .owner = THIS_MODULE, - .name = "v7", - .mount = v7_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("v7"); -MODULE_ALIAS("v7"); - -static int __init init_sysv_fs(void) -{ - int error; - - error = sysv_init_icache(); - if (error) - goto out; - error = register_filesystem(&sysv_fs_type); - if (error) - goto destroy_icache; - error = register_filesystem(&v7_fs_type); - if (error) - goto unregister; - return 0; - -unregister: - unregister_filesystem(&sysv_fs_type); -destroy_icache: - sysv_destroy_icache(); -out: - return error; -} - -static void __exit exit_sysv_fs(void) -{ - unregister_filesystem(&sysv_fs_type); - unregister_filesystem(&v7_fs_type); - sysv_destroy_icache(); -} - -module_init(init_sysv_fs) -module_exit(exit_sysv_fs) -MODULE_DESCRIPTION("SystemV Filesystem"); -MODULE_LICENSE("GPL"); diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h deleted file mode 100644 index 0a48b2e7edb1..000000000000 --- a/fs/sysv/sysv.h +++ /dev/null @@ -1,245 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SYSV_H -#define _SYSV_H - -#include <linux/buffer_head.h> - -typedef __u16 __bitwise __fs16; -typedef __u32 __bitwise __fs32; - -#include <linux/sysv_fs.h> - -/* - * SystemV/V7/Coherent super-block data in memory - * - * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified - * while the system is running). This is in contrast to the Minix and Berkeley - * filesystems (where the superblock is never modified). This affects the - * sync() operation: we must keep the superblock in a disk buffer and use this - * one as our "working copy". - */ - -struct sysv_sb_info { - struct super_block *s_sb; /* VFS superblock */ - int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ - char s_bytesex; /* bytesex (le/be/pdp) */ - unsigned int s_inodes_per_block; /* number of inodes per block */ - unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ - unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ - unsigned int s_ind_per_block; /* number of indirections per block */ - unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ - unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ - unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ - unsigned int s_block_base; /* physical block number of block 0 */ - unsigned short s_fic_size; /* free inode cache size, NICINOD */ - unsigned short s_flc_size; /* free block list chunk size, NICFREE */ - /* The superblock is kept in one or two disk buffers: */ - struct buffer_head *s_bh1; - struct buffer_head *s_bh2; - /* These are pointers into the disk buffer, to compensate for - different superblock layout. 
*/ - char * s_sbd1; /* entire superblock data, for part 1 */ - char * s_sbd2; /* entire superblock data, for part 2 */ - __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ - sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ - __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ - __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ - sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ - __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ - __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ - __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ - /* We keep those superblock entities that don't change here; - this saves us an indirection and perhaps a conversion. */ - u32 s_firstinodezone; /* index of first inode zone */ - u32 s_firstdatazone; /* same as s_sbd->s_isize */ - u32 s_ninodes; /* total number of inodes */ - u32 s_ndatazones; /* total number of data zones */ - u32 s_nzones; /* same as s_sbd->s_fsize */ - u16 s_namelen; /* max length of dir entry */ - int s_forced_ro; - struct mutex s_lock; -}; - -/* - * SystemV/V7/Coherent FS inode data in memory - */ -struct sysv_inode_info { - __fs32 i_data[13]; - u32 i_dir_start_lookup; - struct inode vfs_inode; -}; - - -static inline struct sysv_inode_info *SYSV_I(struct inode *inode) -{ - return container_of(inode, struct sysv_inode_info, vfs_inode); -} - -static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - - -/* identify the FS in memory */ -enum { - FSTYPE_NONE = 0, - FSTYPE_XENIX, - FSTYPE_SYSV4, - FSTYPE_SYSV2, - FSTYPE_COH, - FSTYPE_V7, - FSTYPE_AFS, - FSTYPE_END, -}; - -#define SYSV_MAGIC_BASE 0x012FF7B3 - -#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) -#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) -#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) -#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) - - -/* Admissible values for i_nlink: 0.._LINK_MAX */ -enum { - XENIX_LINK_MAX = 126, /* ?? */ - SYSV_LINK_MAX = 126, /* 127? 251? */ - V7_LINK_MAX = 126, /* ?? 
*/ - COH_LINK_MAX = 10000, -}; - - -static inline void dirty_sb(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); -} - - -/* ialloc.c */ -extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, - struct buffer_head **); -extern struct inode * sysv_new_inode(const struct inode *, umode_t); -extern void sysv_free_inode(struct inode *); -extern unsigned long sysv_count_free_inodes(struct super_block *); - -/* balloc.c */ -extern sysv_zone_t sysv_new_block(struct super_block *); -extern void sysv_free_block(struct super_block *, sysv_zone_t); -extern unsigned long sysv_count_free_blocks(struct super_block *); - -/* itree.c */ -void sysv_truncate(struct inode *); -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); - -/* inode.c */ -extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); -extern int sysv_sync_inode(struct inode *); -extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int sysv_init_icache(void); -extern void sysv_destroy_icache(void); - - -/* dir.c */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); -int sysv_add_link(struct dentry *, struct inode *); -int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); -int sysv_make_empty(struct inode *, struct inode *); -int sysv_empty_dir(struct inode *); -int sysv_set_link(struct sysv_dir_entry *, struct folio *, - struct inode *); -struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); -ino_t sysv_inode_by_name(struct dentry *); - - -extern const struct inode_operations sysv_file_inode_operations; -extern const struct inode_operations sysv_dir_inode_operations; -extern const struct file_operations sysv_file_operations; -extern const struct file_operations sysv_dir_operations; -extern const struct address_space_operations sysv_aops; -extern const struct super_operations sysv_sops; - - -enum { - BYTESEX_LE, - BYTESEX_PDP, - BYTESEX_BE, -}; - -static inline u32 PDP_swab(u32 x) -{ -#ifdef __LITTLE_ENDIAN - return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); -#else -#ifdef __BIG_ENDIAN - return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); -#else -#error BYTESEX -#endif -#endif -} - -static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return PDP_swab((__force __u32)n); - else if (sbi->s_bytesex == BYTESEX_LE) - return le32_to_cpu((__force __le32)n); - else - return be32_to_cpu((__force __be32)n); -} - -static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return (__force __fs32)PDP_swab(n); - else if (sbi->s_bytesex == BYTESEX_LE) - return (__force __fs32)cpu_to_le32(n); - else - return (__force __fs32)cpu_to_be32(n); -} - -static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); - else if (sbi->s_bytesex == BYTESEX_LE) - le32_add_cpu((__le32 *)n, d); - else - be32_add_cpu((__be32 *)n, d); - return *n; -} - -static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return le16_to_cpu((__force __le16)n); - else - return be16_to_cpu((__force __be16)n); -} - -static inline 
__fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return (__force __fs16)cpu_to_le16(n); - else - return (__force __fs16)cpu_to_be16(n); -} - -static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) -{ - if (sbi->s_bytesex != BYTESEX_BE) - le16_add_cpu((__le16 *)n, d); - else - be16_add_cpu((__be16 *)n, d); - return *n; -} - -#endif /* _SYSV_H */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 9f7eb451a60f..c68f28d9c426 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { - hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); - ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { @@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else - hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); @@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } - file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; fd_install(ufd, file); return ufd; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 53214499e384..cb1af30b49f5 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, - struct inode *inode, struct dentry *dentry, - umode_t mode) +static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap, + struct inode *inode, struct dentry *dentry, + umode_t mode) { struct tracefs_inode *ti; char *name; @@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, name = get_dname(dentry); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* * This is a new directory that does not take the default of @@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, kfree(name); - return ret; + return ERR_PTR(ret); } static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index fda82f3e16e8..3c3d3ad4fa6c 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1002,8 +1002,8 @@ out_fname: return err; } -static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = ubifs_budget_space(c, &req); if (err) - return err; + return ERR_PTR(err); err = ubifs_prepare_create(dir, dentry, &nm); if (err) @@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, ubifs_release_budget(c, &req); d_instantiate(dentry, inode); fscrypt_free_filename(&nm); - return 0; + return NULL; out_cancel: dir->i_size -= sz_change; @@ 
-1074,7 +1074,7 @@ out_fname: fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); - return err; + return ERR_PTR(err); } static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01d8eb170382..a79f229df475 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->c = c; wbuf->next_ino = 0; - hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wbuf->timer.function = wbuf_timer_callback_nolock; + hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2cb49b6b0716..5f2e9a892bff 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_iter iter; @@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inode = udf_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } set_nlink(inode, 2); iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -456,7 +456,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - return 0; + return NULL; } static int empty_dir(struct inode *dir) diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 38a024c8cccd..5b3c85c93242 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, - struct dentry * dentry, umode_t mode) +static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, + struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, goto out_fail; d_instantiate_new(dentry, inode); - return 0; + return NULL; out_fail: inode_dec_link_count(inode); @@ -202,7 +202,7 @@ out_fail: discard_new_inode(inode); out_dir: inode_dec_link_count(dir); - return err; + return ERR_PTR(err); } static int ufs_unlink(struct inode *dir, struct dentry *dentry) diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index da786a687fdc..4ad2c36550f1 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -10,6 +10,7 @@ config UNICODE be a separate loadable module that gets requested only when a file system actually use it. 
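The ubifs, udf and ufs hunks above all apply the same interface change: the inode_operations ->mkdir() hook now returns a struct dentry * instead of an int, with NULL meaning "success, use the dentry that was passed in" and ERR_PTR() carrying the error. A minimal sketch of the converted shape, using a hypothetical examplefs with assumed examplefs_new_inode()/examplefs_add_link() helpers (error cleanup is simplified):

static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap,
                                      struct inode *dir, struct dentry *dentry,
                                      umode_t mode)
{
        struct inode *inode;
        int err;

        /* Assumed helper: allocate and initialise the new directory inode. */
        inode = examplefs_new_inode(dir, S_IFDIR | mode);
        if (IS_ERR(inode))
                return ERR_CAST(inode);         /* propagate as ERR_PTR, not as int */

        /* Assumed helper: link the new inode into the parent directory. */
        err = examplefs_add_link(dentry, inode);
        if (err) {
                iput(inode);                    /* real filesystems do fuller cleanup here */
                return ERR_PTR(err);            /* was "return err;" before the conversion */
        }

        d_instantiate_new(dentry, inode);
        return NULL;                            /* NULL: success on the passed-in dentry */
}

Filesystems that simply wrap an int-returning helper can return ERR_PTR(ret) directly, as the vboxsf_dir_mkdir hunk further down does.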
-config UNICODE_NORMALIZATION_SELFTEST +config UNICODE_NORMALIZATION_KUNIT_TEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE && KUNIT + default KUNIT_ALL_TESTS diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index e309afe2b2bb..d95be7fb9f6b 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig new file mode 100644 index 000000000000..62dd5c171f9c --- /dev/null +++ b/fs/unicode/tests/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_UNICODE=y +CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c index 5ddaf27b21a6..5063e8138aec 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/tests/utf8_kunit.c @@ -1,34 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel module for testing utf-8 support. + * KUnit tests for utf-8 support. * * Copyright 2017 Collabora Ltd. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/printk.h> #include <linux/unicode.h> -#include <linux/dcache.h> - -#include "utf8n.h" - -static unsigned int failed_tests; -static unsigned int total_tests; - -#define _test(cond, func, line, fmt, ...) do { \ - total_tests++; \ - if (!cond) { \ - failed_tests++; \ - pr_err("test %s:%d Failed: %s%s", \ - func, line, #cond, (fmt?":":".")); \ - if (fmt) \ - pr_err(fmt, ##__VA_ARGS__); \ - } \ - } while (0) -#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define test(cond) _test(cond, __func__, __LINE__, "") +#include <kunit/test.h> + +#include "../utf8n.h" static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. 
*/ @@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, return utf8ncursor(u8c, um, n, s, (unsigned int)-1); } -static void check_utf8_nfdi(struct unicode_map *um) +static void check_utf8_nfdi(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); int nlen = strlen(nfdi_test_data[i].dec); int j = 0; unsigned char c; + int ret; + + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), + nlen); - test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == - nlen)); - if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdi_test_data[i].dec[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdi_test_data[i].dec[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdi_test_data[i].dec[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_nfdicf(struct unicode_map *um) +static void check_utf8_nfdicf(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); int nlen = strlen(nfdicf_test_data[i].ncf); int j = 0; + int ret; unsigned char c; - test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == - nlen)); - test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == - nlen)); + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), + nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), + nlen); - if (utf8cursor(&u8c, um, UTF8_NFDICF, - nfdicf_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdicf_test_data[i].ncf[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdicf_test_data[i].ncf[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdicf_test_data[i].ncf[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_comparisons(struct unicode_map *table) +static void check_utf8_comparisons(struct kunit *test) { int i; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdi_test_data[i].dec, .len = sizeof(nfdi_test_data[i].dec)}; - test_f(!utf8_strncmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { @@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = 
nfdicf_test_data[i].ncf, .len = sizeof(nfdicf_test_data[i].ncf)}; - test_f(!utf8_strncasecmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncasecmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } } -static void check_supported_versions(struct unicode_map *um) +static void check_supported_versions(struct kunit *test) { + struct unicode_map *um = test->priv; /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(um, UTF8_LATEST)); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. */ - test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } -static int __init init_test_ucd(void) +static struct kunit_case unicode_normalization_test_cases[] = { + KUNIT_CASE(check_supported_versions), + KUNIT_CASE(check_utf8_comparisons), + KUNIT_CASE(check_utf8_nfdicf), + KUNIT_CASE(check_utf8_nfdi), + {} +}; + +static int init_test_ucd(struct kunit *test) { - struct unicode_map *um; + struct unicode_map *um = utf8_load(UTF8_LATEST); - failed_tests = 0; - total_tests = 0; + test->priv = um; - um = utf8_load(UTF8_LATEST); - if (IS_ERR(um)) { - pr_err("%s: Unable to load utf8 table.\n", __func__); - return PTR_ERR(um); - } + KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, + "%s: Unable to load utf8 table.\n", __func__); - check_supported_versions(um); - check_utf8_nfdi(um); - check_utf8_nfdicf(um); - check_utf8_comparisons(um); - - if (!failed_tests) - pr_info("All %u tests passed\n", total_tests); - else - pr_err("%u out of %u tests failed\n", failed_tests, - total_tests); - utf8_unload(um); return 0; } -static void __exit exit_test_ucd(void) +static void exit_test_ucd(struct kunit *test) { + utf8_unload(test->priv); } -module_init(init_test_ucd); -module_exit(exit_test_ucd); +static struct kunit_suite unicode_normalization_test_suite = { + .name = "unicode_normalization", + .test_cases = unicode_normalization_test_cases, + .init = init_test_ucd, + .exit = exit_test_ucd, +}; + +kunit_test_suite(unicode_normalization_test_suite); + MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); -MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); +MODULE_DESCRIPTION("KUnit tests for utf-8 support."); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 768f8ab448b8..7b998c99c88d 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -586,7 +586,7 @@ ccc_mismatch: } } -#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); diff --git a/fs/vboxsf/dir.c 
b/fs/vboxsf/dir.c index a859ac9b74ba..770e29ec3557 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, - struct inode *parent, struct dentry *dentry, - umode_t mode) +static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap, + struct inode *parent, struct dentry *dentry, + umode_t mode) { - return vboxsf_dir_create(parent, dentry, mode, true, true, NULL); + return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL)); } static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry, diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index c287c755f2c5..3537f3cca6d5 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -167,10 +167,11 @@ xrep_orphanage_create( * directory to control access to a file we put in here. */ if (d_really_is_negative(orphanage_dentry)) { - error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, - 0750); - if (error) - goto out_dput_orphanage; + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, + orphanage_dentry, 0750); + error = PTR_ERR(orphanage_dentry); + if (IS_ERR(orphanage_dentry)) + goto out_unlock_root; } /* Not a directory? Bail out. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6d9965b546cb..5077d52a775d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -115,7 +115,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); @@ -126,9 +126,9 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. 
*/ - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) @@ -396,10 +396,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -410,7 +411,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,10 +419,14 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -463,7 +468,7 @@ xfs_discard_folio( static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, .discard_folio = xfs_discard_folio, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a435b1ff264..85b857805d6d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1498,7 +1498,8 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + NULL); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); @@ -1613,7 +1614,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d61460309a78..f631177ac320 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -828,6 +828,10 @@ xfs_direct_write_iomap_begin( if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; + /* HW-offload atomics are always used in this path */ + if (flags & IOMAP_ATOMIC) + iomap_flags |= IOMAP_F_ATOMIC_BIO; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. 
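The xfs_aops.c hunks above convert the ->prepare_ioend writeback callback into ->submit_ioend, which now receives the iomap_writepage_ctx and is responsible for submitting the bio itself. A minimal sketch of that contract, with examplefs_end_bio standing in for a driver-specific completion handler (illustrative, not from the patch):

static int examplefs_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
        struct iomap_ioend *ioend = wpc->ioend;

        /* Route completions that may need a transaction to our own end_io handler. */
        if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
                ioend->io_bio.bi_end_io = examplefs_end_bio;

        /* If mapping already failed, do not submit; report the error back instead. */
        if (status)
                return status;

        submit_bio(&ioend->io_bio);
        return 0;
}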
@@ -1495,7 +1499,7 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } int @@ -1510,5 +1514,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 40289fe6f5b2..a4480098d2bf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -298,14 +298,14 @@ xfs_vn_create( return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } -STATIC int +STATIC struct dentry * xfs_vn_mkdir( struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..62d04f4843cf 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2122,7 +2122,8 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index a17e97e634a6..d0eccbd920e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -280,6 +280,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg); void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx); #else static inline void acpi_processor_power_init_bm_check(struct acpi_processor_flags @@ -300,6 +301,10 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx { return; } +static inline void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + BUG(); +} #endif static inline int call_on_cpu(int cpu, long (*fn)(void *), void *arg, diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..3c61c29ff6ab 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1212,7 +1212,7 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) #ifndef memset_io /** - * memset_io Set a range of I/O memory to a constant value + * memset_io - Set a range of I/O memory to a constant value * @addr: The beginning of the I/O-memory range to set * @val: The value to set the memory to * @count: The number of bytes to set @@ -1224,7 +1224,7 @@ void memset_io(volatile void __iomem *addr, int val, size_t count); #ifndef memcpy_fromio /** - * memcpy_fromio Copy a block of data from I/O memory 
+ * memcpy_fromio - Copy a block of data from I/O memory * @dst: The (RAM) destination for the copy * @src: The (I/O memory) source for the data * @count: The number of bytes to copy @@ -1236,7 +1236,7 @@ void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count); #ifndef memcpy_toio /** - * memcpy_toio Copy a block of data into I/O memory + * memcpy_toio - Copy a block of data into I/O memory * @dst: The (I/O memory) destination for the copy * @src: The (RAM) source for the data * @count: The number of bytes to copy diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index c768de6f19a9..0755bc39b0d8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -39,7 +39,7 @@ extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; -extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; +extern char __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 01dafd604188..b550afa15ecd 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,24 +4,35 @@ #ifndef __ASSEMBLY__ -#ifndef __arch_get_k_vdso_data -static __always_inline struct vdso_data *__arch_get_k_vdso_data(void) +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE + +#ifndef __arch_get_vdso_u_time_data +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return NULL; + return &vdso_u_time_data; } -#endif /* __arch_get_k_vdso_data */ +#endif + +#ifndef __arch_get_vdso_u_rng_data +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) +{ + return &vdso_u_rng_data; +} +#endif + +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ #ifndef __arch_update_vsyscall -static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata) +static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata) { } #endif /* __arch_update_vsyscall */ -#ifndef __arch_sync_vdso_data -static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) +#ifndef __arch_sync_vdso_time_data +static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { } -#endif /* __arch_sync_vdso_data */ +#endif /* __arch_sync_vdso_time_data */ #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0d5b186abee8..58a635a6d5bd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -385,6 +385,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN(PAGE_SIZE); \ __nosave_end = .; +#define CACHE_HOT_DATA(align) \ + . = ALIGN(align); \ + *(SORT_BY_ALIGNMENT(.data..hot.*)) \ + . = ALIGN(align); + #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ *(.data..page_aligned) \ @@ -404,7 +409,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_init_stack = .; \ init_thread_union = .; \ init_stack = .; \ - KEEP(*(.data..init_task)) \ KEEP(*(.data..init_thread_info)) \ . 
= __start_init_stack + THREAD_SIZE; \ __end_init_stack = .; @@ -1062,10 +1066,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #define PERCPU_INPUT(cacheline) \ __per_cpu_start = .; \ - *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ + __per_cpu_hot_start = .; \ + *(SORT_BY_ALIGNMENT(.data..percpu..hot.*)) \ + __per_cpu_hot_end = .; \ + . = ALIGN(cacheline); \ *(.data..percpu..read_mostly) \ . = ALIGN(cacheline); \ *(.data..percpu) \ @@ -1074,52 +1081,17 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __per_cpu_end = .; /** - * PERCPU_VADDR - define output section for percpu area + * PERCPU_SECTION - define output section for percpu area * @cacheline: cacheline size - * @vaddr: explicit base address (optional) - * @phdr: destination PHDR (optional) * * Macro which expands to output section for percpu area. * * @cacheline is used to align subsections to avoid false cacheline * sharing between subsections for different purposes. - * - * If @vaddr is not blank, it specifies explicit base address and all - * percpu symbols will be offset from the given address. If blank, - * @vaddr always equals @laddr + LOAD_OFFSET. - * - * @phdr defines the output PHDR to use if not blank. Be warned that - * output PHDR is sticky. If @phdr is specified, the next output - * section in the linker script will go there too. @phdr should have - * a leading colon. - * - * Note that this macros defines __per_cpu_load as an absolute symbol. - * If there is no need to put the percpu section at a predetermined - * address, use PERCPU_SECTION. - */ -#define PERCPU_VADDR(cacheline, vaddr, phdr) \ - __per_cpu_load = .; \ - .data..percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \ - PERCPU_INPUT(cacheline) \ - } phdr \ - . = __per_cpu_load + SIZEOF(.data..percpu); - -/** - * PERCPU_SECTION - define output section for percpu area, simple version - * @cacheline: cacheline size - * - * Align to PAGE_SIZE and outputs output section for percpu area. This - * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and - * __per_cpu_start will be identical. - * - * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) - * except that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 configuration. */ #define PERCPU_SECTION(cacheline) \ . 
= ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ - __per_cpu_load = .; \ PERCPU_INPUT(cacheline) \ } @@ -1148,6 +1120,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) INIT_TASK_DATA(inittask) \ NOSAVE_DATA \ PAGE_ALIGNED_DATA(pagealigned) \ + CACHE_HOT_DATA(cacheline) \ CACHELINE_ALIGNED_DATA(cacheline) \ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4e495b29c640..a70e62d69dc7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -330,7 +330,6 @@ static inline bool acpi_sci_irq_valid(void) } extern int sbf_port; -extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); diff --git a/include/linux/align.h b/include/linux/align.h index 2b4acec7b95a..55debf105a5d 100644 --- a/include/linux/align.h +++ b/include/linux/align.h @@ -2,14 +2,6 @@ #ifndef _LINUX_ALIGN_H #define _LINUX_ALIGN_H -#include <linux/const.h> - -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +#include <vdso/align.h> #endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 60d674af3080..1625c8529e70 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -64,7 +64,7 @@ struct linux_binprm { const char *fdpath; /* generated filename for execveat */ unsigned interp_flags; int execfd; /* File descriptor of the executable */ - unsigned long loader, exec; + unsigned long exec; struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2026953e2c4e..595217b7a6e7 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -560,9 +560,9 @@ void bitmap_replace(unsigned long *dst, * ...0..11...0..10 * dst: 0000001100000010 * - * A relationship exists between bitmap_scatter() and bitmap_gather(). + * A relationship exists between bitmap_scatter() and bitmap_gather(). See + * bitmap_gather() for the bitmap gather detailed operations. TL;DR: * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. - * See bitmap_scatter() for details related to this relationship. */ static __always_inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, @@ -608,7 +608,9 @@ void bitmap_scatter(unsigned long *dst, const unsigned long *src, * dst: 0000000000011010 * * A relationship exists between bitmap_gather() and bitmap_scatter(). See - * bitmap_scatter() for the bitmap scatter detailed operations. + * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR: + * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation. + * * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n). * The operation bitmap_gather(result, scattered, mask, n) leads to a result * equal or equivalent to src. 
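The bitmap.h comment rework above cross-references bitmap_scatter() and bitmap_gather() as reverse operations. A small illustrative fragment (not part of the patch) showing the round trip:

static void scatter_gather_roundtrip_demo(void)
{
        /* Illustrative values; any src that fits in bitmap_weight(mask) bits works. */
        DECLARE_BITMAP(src, 16)       = { 0x001a };
        DECLARE_BITMAP(mask, 16)      = { 0x3d40 };
        DECLARE_BITMAP(scattered, 16) = { 0 };
        DECLARE_BITMAP(result, 16)    = { 0 };

        bitmap_scatter(scattered, src, mask, 16);   /* bit i of src lands at the i-th set bit of mask */
        bitmap_gather(result, scattered, mask, 16); /* pulls those bits back into the low end of result */

        /* result now equals src, since src fits within bitmap_weight(mask) bits. */
}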
diff --git a/include/linux/bits.h b/include/linux/bits.h index 61a75d3f294b..14fd0ca9a6cd 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -40,7 +40,7 @@ * Missing asm support * * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __init128' data + * in the asm code, as it shifts an 'unsigned __int128' data * type instead of direct representation of 128 bit constants * such as long and unsigned long. The fundamental problem is * that a 128 bit constant will get silently truncated by the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d37751789bf5..1c0cf6af392c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -268,10 +268,16 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } +/* + * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) + * however we constrain this to what we can validate and test. + */ +#define BLK_MAX_BLOCK_SIZE SZ_64K + /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) + if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; diff --git a/include/linux/cache.h b/include/linux/cache.h index ca2a05682a54..e69768f50d53 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -3,16 +3,13 @@ #define __LINUX_CACHE_H #include <uapi/linux/kernel.h> +#include <vdso/cache.h> #include <asm/cache.h> #ifndef L1_CACHE_ALIGN #define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES) #endif -#ifndef SMP_CACHE_BYTES -#define SMP_CACHE_BYTES L1_CACHE_BYTES -#endif - /** * SMP_CACHE_ALIGN - align a value to the L2 cacheline size * @x: value to align @@ -63,10 +60,6 @@ #define __ro_after_init __section(".data..ro_after_init") #endif -#ifndef ____cacheline_aligned -#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) -#endif - #ifndef ____cacheline_aligned_in_smp #ifdef CONFIG_SMP #define ____cacheline_aligned_in_smp ____cacheline_aligned diff --git a/include/linux/cfi.h b/include/linux/cfi.h index f0df518e11dd..1db17ecbb86c 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,8 @@ #include <linux/module.h> #include <asm/cfi.h> +extern bool cfi_warn; + #ifndef cfi_get_offset static inline int cfi_get_offset(void) { diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 17960a1e858d..485b651869d9 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -619,9 +619,8 @@ struct cgroup_root { */ struct cftype { /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. + * Name of the subsystem is prepended in cgroup_file_name(). + * Zero length string indicates end of cftype array. 
*/ char name[MAX_CFTYPE_NAME]; unsigned long private; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8ef47f8a634..28e999f2c642 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -113,6 +113,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); @@ -689,8 +690,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_hold(struct cgroup *cgrp); -void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index ee2614adb785..2b32a5759b22 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -216,6 +216,23 @@ const volatile void * __must_check_fn(const volatile void *val) #define return_ptr(p) return no_free_ptr(p) +/* + * Only for situations where an allocation is handed in to another function + * and consumed by that function on success. + * + * struct foo *f __free(kfree) = kzalloc(sizeof(*f), GFP_KERNEL); + * + * setup(f); + * if (some_condition) + * return -EINVAL; + * .... + * ret = bar(f); + * if (!ret) + * retain_ptr(f); + * return ret; + */ +#define retain_ptr(p) \ + __get_and_null(p, NULL) /* * DEFINE_CLASS(name, type, exit, init, init_args...): @@ -291,11 +308,21 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond -#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ +#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)(__force unsigned long)*(_exp); } + +#define DEFINE_CLASS_IS_GUARD(_name) \ __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_CLASS_IS_COND_GUARD(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, true); \ + __DEFINE_GUARD_LOCK_PTR(_name, _T) + +#define DEFINE_GUARD(_name, _type, _lock, _unlock) \ DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ - static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ - { return (void *)(__force unsigned long)*_T; } + DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND(_name, _ext, _condlock) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ @@ -375,11 +402,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ if (_T->lock) { _unlock; } \ } \ \ -static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ -{ \ - return (void *)(__force unsigned long)_T->lock; \ -} - +__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 155385754824..9fc30b6b80c9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -206,12 +206,38 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int 
val, #define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ "must be byte array") +/* + * If the "nonstring" attribute isn't available, we have to return true + * so the __must_*() checks pass when "nonstring" isn't supported. + */ +#if __has_attribute(__nonstring__) && defined(__annotated) +#define __is_cstr(a) (!__annotated(a, nonstring)) +#define __is_noncstr(a) (__annotated(a, nonstring)) +#else +#define __is_cstr(a) (true) +#define __is_noncstr(a) (true) +#endif + /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ #define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + __BUILD_BUG_ON_ZERO_MSG(!__is_cstr(p), \ + "must be C-string (NUL-terminated)") +#define __must_be_noncstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \ + "must be non-C-string (not NUL-terminated)") #endif /* __KERNEL__ */ +#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +#define KCFI_REFERENCE(sym) __ADDRESSABLE(sym) +#else +#define KCFI_REFERENCE(sym) +#endif + /** * offset_to_ptr - convert a relative memory offset to an absolute pointer * @off: the address of the 32-bit offset value diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 981cc3d7e3aa..e09d323be845 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -349,6 +349,18 @@ struct ftrace_likely_data { #endif /* + * Optional: only supported since gcc >= 15 + * Optional: not supported by Clang + * + * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178 + */ +#ifdef CONFIG_CC_HAS_MULTIDIMENSIONAL_NONSTRING +# define __nonstring_array __attribute__((__nonstring__)) +#else +# define __nonstring_array +#endif + +/* * Apply __counted_by() when the Endianness matches to increase test coverage. */ #ifdef __LITTLE_ENDIAN @@ -360,7 +372,7 @@ struct ftrace_likely_data { #endif /* Do not trap wrapping arithmetic within an annotated function. */ -#ifdef CONFIG_UBSAN_SIGNED_WRAP +#ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) #else # define __signed_wrap @@ -446,11 +458,14 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif -/* Determine if an attribute has been applied to a variable. */ +/* + * Determine if an attribute has been applied to a variable. + * Using __annotated needs to check for __annotated being available, + * or negative tests may fail when annotation cannot be checked. For + * example, see the definition of __is_cstr(). 
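The compiler.h hunk above makes __must_be_cstr() tolerant of toolchains without the nonstring attribute and adds __must_be_noncstr(). A hedged sketch of how such a check is typically wired into a helper; checked_strlen() and the variables are illustrative, not from the patch:

/* Raw bytes that are deliberately not NUL-terminated. */
static char uuid_bytes[16] __nonstring;
static char label[16] = "root";

/* __must_be_cstr() evaluates to 0 or breaks the build, so it can be folded into an expression. */
#define checked_strlen(s)       (__must_be_cstr(s) + strlen(s))

/*
 * checked_strlen(label) compiles; checked_strlen(uuid_bytes) trips the build
 * assertion when the compiler supports __builtin_has_attribute and nonstring.
 */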
+ */ #if __has_builtin(__builtin_has_attribute) #define __annotated(var, attr) __builtin_has_attribute(var, attr) -#else -#define __annotated(var, attr) (false) #endif /* diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..02fd4746231d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1184,7 +1184,7 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ } #endif -extern unsigned int arch_freq_get_on_cpu(int cpu); +extern int arch_freq_get_on_cpu(int cpu); #ifndef arch_set_freq_scale static __always_inline diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 36a890d0dd57..f9a868384083 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -81,7 +81,7 @@ static __always_inline void set_nr_cpu_ids(unsigned int nr) * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated - * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online + * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * @@ -285,35 +285,52 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, } /** - * for_each_cpu - iterate over every cpu in a mask - * @cpu: the (optionally unsigned) integer iterator - * @mask: the cpumask pointer + * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from + * @n+1. If nothing found, wrap around and start from + * the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src1p: the first cpumask pointer + * @src2p: the second cpumask pointer * - * After the loop, cpu is >= nr_cpu_ids. + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ -#define for_each_cpu(cpu, mask) \ - for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) - -#if NR_CPUS == 1 static __always_inline -unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, + const struct cpumask *src2p) { - cpumask_check(start); + /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); + return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), + small_cpumask_bits, n + 1); +} - /* - * Return the first available CPU when wrapping, or when starting before cpu0, - * since there is only one valid option. - */ - if (wrap && n >= 0) - return nr_cpumask_bits; - - return cpumask_first(mask); +/** + * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing + * found, wrap around and start from the beginning + * @n: the cpu prior to the place to search (i.e. search starts from @n+1) + * @src: cpumask pointer + * + * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. + */ +static __always_inline +unsigned int cpumask_next_wrap(int n, const struct cpumask *src) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } -#else -unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); -#endif + +/** + * for_each_cpu - iterate over every cpu in a mask + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask pointer + * + * After the loop, cpu is >= nr_cpu_ids. 
+ */ +#define for_each_cpu(cpu, mask) \ + for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location @@ -1033,11 +1050,21 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) +#define for_each_online_cpu_wrap(cpu, start) \ + for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) + +#define for_each_possible_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) +#define for_each_online_cpu_wrap(cpu, start) \ + for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 835e7b793f6a..5466c96a33db 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -125,9 +125,11 @@ static inline int cpuset_do_page_mem_spread(void) extern bool current_cpuset_is_being_rebound(void); +extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); +extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -259,11 +261,20 @@ static inline bool current_cpuset_is_being_rebound(void) return false; } +static inline void dl_rebuild_rd_accounting(void) +{ +} + static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_reset_sched_domains(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_print_current_mems_allowed(void) { } diff --git a/include/linux/damon.h b/include/linux/damon.h index af525252b853..c9074d569596 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -470,6 +470,11 @@ struct damos { unsigned long next_apply_sis; /* informs if ongoing DAMOS walk for this scheme is finished */ bool walk_completed; + /* + * If the current region in the filtering stage is allowed by core + * layer-handled filters. If true, operations layer allows it, too. 
+ */ + bool core_filters_allowed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4afb60365675..45bff10d3773 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -203,34 +203,34 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED BIT(15) +#define DCACHE_DENTRY_KILLED BIT(14) -#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST BIT(19) +#define DCACHE_LRU_LIST BIT(18) -#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ -#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ -#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ +#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */ +#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */ +#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */ -#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL BIT(26) +#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(23) -#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR BIT(29) -#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ +#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(25) +#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */ extern seqlock_t rename_lock; @@ -253,7 +253,6 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); -extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index d7e30d4f7503..f3bc0bcd7098 
100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -78,14 +78,18 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map) #define phys_to_dma_unencrypted phys_to_dma #endif #else -static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, - phys_addr_t paddr) +static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { if (dev->dma_range_map) return translate_phys_to_dma(dev, paddr); return paddr; } +static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, + phys_addr_t paddr) +{ + return dma_addr_unencrypted(__phys_to_dma(dev, paddr)); +} /* * If memory encryption is supported, phys_to_dma will set the memory encryption * bit in the DMA address, and dma_to_phys will clear it. @@ -94,19 +98,20 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev, */ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return __sme_set(phys_to_dma_unencrypted(dev, paddr)); + return dma_addr_encrypted(__phys_to_dma(dev, paddr)); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dma_addr) { phys_addr_t paddr; + dma_addr = dma_addr_canonical(dma_addr); if (dev->dma_range_map) paddr = translate_dma_to_phys(dev, dma_addr); else paddr = dma_addr; - return __sme_clr(paddr); + return paddr; } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/include/linux/edac.h b/include/linux/edac.h index b4ee8961e623..451f9c152c99 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -661,4 +661,219 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } + +#define EDAC_FEAT_NAME_LEN 128 + +/* RAS feature type */ +enum edac_dev_feat { + RAS_FEAT_SCRUB, + RAS_FEAT_ECS, + RAS_FEAT_MEM_REPAIR, + RAS_FEAT_MAX +}; + +/** + * struct edac_scrub_ops - scrub device operations (all elements optional) + * @read_addr: read base address of scrubbing range. + * @read_size: read offset of scrubbing range. + * @write_addr: set base address of the scrubbing range. + * @write_size: set offset of the scrubbing range. + * @get_enabled_bg: check if currently performing background scrub. + * @set_enabled_bg: start or stop a bg-scrub. + * @get_min_cycle: get minimum supported scrub cycle duration in seconds. + * @get_max_cycle: get maximum supported scrub cycle duration in seconds. + * @get_cycle_duration: get current scrub cycle duration in seconds. + * @set_cycle_duration: set current scrub cycle duration in seconds. 
+ */ +struct edac_scrub_ops { + int (*read_addr)(struct device *dev, void *drv_data, u64 *base); + int (*read_size)(struct device *dev, void *drv_data, u64 *size); + int (*write_addr)(struct device *dev, void *drv_data, u64 base); + int (*write_size)(struct device *dev, void *drv_data, u64 size); + int (*get_enabled_bg)(struct device *dev, void *drv_data, bool *enable); + int (*set_enabled_bg)(struct device *dev, void *drv_data, bool enable); + int (*get_min_cycle)(struct device *dev, void *drv_data, u32 *min); + int (*get_max_cycle)(struct device *dev, void *drv_data, u32 *max); + int (*get_cycle_duration)(struct device *dev, void *drv_data, u32 *cycle); + int (*set_cycle_duration)(struct device *dev, void *drv_data, u32 cycle); +}; + +#if IS_ENABLED(CONFIG_EDAC_SCRUB) +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_SCRUB */ + +/** + * struct edac_ecs_ops - ECS device operations (all elements optional) + * @get_log_entry_type: read the log entry type value. + * @set_log_entry_type: set the log entry type value. + * @get_mode: read the mode value. + * @set_mode: set the mode value. + * @reset: reset the ECS counter. + * @get_threshold: read the threshold count per gigabits of memory cells. + * @set_threshold: set the threshold count per gigabits of memory cells. + */ +struct edac_ecs_ops { + int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*reset)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u32 *threshold); + int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u32 threshold); +}; + +struct edac_ecs_ex_info { + u16 num_media_frus; +}; + +#if IS_ENABLED(CONFIG_EDAC_ECS) +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus); +#else +static inline int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_ECS */ + +enum edac_mem_repair_type { + EDAC_REPAIR_MAX +}; + +enum edac_mem_repair_cmd { + EDAC_DO_MEM_REPAIR = 1, +}; + +/** + * struct edac_mem_repair_ops - memory repair operations + * (all elements are optional except do_repair, set_hpa/set_dpa) + * @get_repair_type: get the memory repair type, listed in + * enum edac_mem_repair_function. + * @get_persist_mode: get the current persist mode. + * false - Soft repair type (temporary repair). + * true - Hard memory repair type (permanent repair). + * @set_persist_mode: set the persist mode of the memory repair instance. + * @get_repair_safe_when_in_use: get whether memory media is accessible and + * data is retained during repair operation. + * @get_hpa: get current host physical address (HPA) of memory to repair. + * @set_hpa: set host physical address (HPA) of memory to repair. + * @get_min_hpa: get the minimum supported host physical address (HPA). + * @get_max_hpa: get the maximum supported host physical address (HPA). 
+ * @get_dpa: get current device physical address (DPA) of memory to repair. + * @set_dpa: set device physical address (DPA) of memory to repair. + * In some states of system configuration (e.g. before address decoders + * have been configured), memory devices (e.g. CXL) may not have an active + * mapping in the host physical address map. As such, the memory + * to repair must be identified by a device specific physical addressing + * scheme using a device physical address(DPA). The DPA and other control + * attributes to use for the repair operations will be presented in related + * error records. + * @get_min_dpa: get the minimum supported device physical address (DPA). + * @get_max_dpa: get the maximum supported device physical address (DPA). + * @get_nibble_mask: get current nibble mask of memory to repair. + * @set_nibble_mask: set nibble mask of memory to repair. + * @get_bank_group: get current bank group of memory to repair. + * @set_bank_group: set bank group of memory to repair. + * @get_bank: get current bank of memory to repair. + * @set_bank: set bank of memory to repair. + * @get_rank: get current rank of memory to repair. + * @set_rank: set rank of memory to repair. + * @get_row: get current row of memory to repair. + * @set_row: set row of memory to repair. + * @get_column: get current column of memory to repair. + * @set_column: set column of memory to repair. + * @get_channel: get current channel of memory to repair. + * @set_channel: set channel of memory to repair. + * @get_sub_channel: get current subchannel of memory to repair. + * @set_sub_channel: set subchannel of memory to repair. + * @do_repair: Issue memory repair operation for the HPA/DPA and + * other control attributes set for the memory to repair. + * + * All elements are optional except do_repair and at least one of set_hpa/set_dpa. 
+ */ +struct edac_mem_repair_ops { + int (*get_repair_type)(struct device *dev, void *drv_data, const char **type); + int (*get_persist_mode)(struct device *dev, void *drv_data, bool *persist); + int (*set_persist_mode)(struct device *dev, void *drv_data, bool persist); + int (*get_repair_safe_when_in_use)(struct device *dev, void *drv_data, bool *safe); + int (*get_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*set_hpa)(struct device *dev, void *drv_data, u64 hpa); + int (*get_min_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_max_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*set_dpa)(struct device *dev, void *drv_data, u64 dpa); + int (*get_min_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val); + int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val); + int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank_group)(struct device *dev, void *drv_data, u32 val); + int (*get_bank)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank)(struct device *dev, void *drv_data, u32 val); + int (*get_rank)(struct device *dev, void *drv_data, u32 *val); + int (*set_rank)(struct device *dev, void *drv_data, u32 val); + int (*get_row)(struct device *dev, void *drv_data, u32 *val); + int (*set_row)(struct device *dev, void *drv_data, u32 val); + int (*get_column)(struct device *dev, void *drv_data, u32 *val); + int (*set_column)(struct device *dev, void *drv_data, u32 val); + int (*get_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_channel)(struct device *dev, void *drv_data, u32 val); + int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val); + int (*do_repair)(struct device *dev, void *drv_data, u32 val); +}; + +#if IS_ENABLED(CONFIG_EDAC_MEM_REPAIR) +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_MEM_REPAIR */ + +/* EDAC device feature information structure */ +struct edac_dev_data { + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + u8 instance; + void *private; +}; + +struct edac_dev_feat_ctx { + struct device dev; + void *private; + struct edac_dev_data *scrub; + struct edac_dev_data ecs; + struct edac_dev_data *mem_repair; +}; + +struct edac_dev_feature { + enum edac_dev_feat ft_type; + u8 instance; + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + void *ctx; + struct edac_ecs_ex_info ecs_info; +}; + +int edac_dev_register(struct device *parent, char *dev_name, + void *parent_pvt_data, int num_features, + const struct edac_dev_feature *ras_features); #endif /* _LINUX_EDAC_H_ */ diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..65efc0f5ea2e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state 
*ps; int i; -#ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); -#endif if (!sum_util) return 0; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 0c0d00fcd131..ccb478eb174b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); +/* Copy ready events to userspace */ +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents); + /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 64130ae19690..65655a5d1be2 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -65,6 +65,37 @@ enum execmem_range_flags { * Architectures that use EXECMEM_ROX_CACHE must implement this. */ void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); + +/** + * execmem_make_temp_rw - temporarily remap region with read-write + * permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Remaps a part of the cached large page in the ROX cache in the range + * [@ptr, @ptr + @size) as writable and not executable. The caller must + * have exclusive ownership of this range and ensure nothing will try to + * execute code in this range. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_make_temp_rw(void *ptr, size_t size); + +/** + * execmem_restore_rox - restore read-only-execute permissions + * @ptr: address of the region to remap + * @size: size of the region to remap + * + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) + * after it was temporarily remapped as writable. Relies on architecture + * implementation of set_memory_rox() to restore mapping using large pages. + * + * Return: 0 on success or negative error code on failure. + */ +int execmem_restore_rox(void *ptr, size_t size); +#else +static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } +static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif /** diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 78f660ebc318..3c817dc6292e 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -25,7 +25,7 @@ #define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET) -#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD) +#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD | FAN_REPORT_MNT) /* * fanotify_init() flags that require CAP_SYS_ADMIN. @@ -38,7 +38,8 @@ FAN_REPORT_PIDFD | \ FAN_REPORT_FD_ERROR | \ FAN_UNLIMITED_QUEUE | \ - FAN_UNLIMITED_MARKS) + FAN_UNLIMITED_MARKS | \ + FAN_REPORT_MNT) /* * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. 
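A minimal usage sketch (not part of this patch) for the execmem_make_temp_rw()/execmem_restore_rox() pair documented above, assuming the caller has exclusive ownership of [ptr, ptr + size) and guarantees nothing executes the range while it is writable; the helper name patch_rox_range is illustrative only:

#include <linux/execmem.h>
#include <linux/string.h>

/* Hypothetical helper: copy new instructions into an ROX-cache allocation. */
static int patch_rox_range(void *ptr, const void *insns, size_t size)
{
	int err;

	/* Drop execute and gain write permission on the cached large page. */
	err = execmem_make_temp_rw(ptr, size);
	if (err)
		return err;

	memcpy(ptr, insns, size);

	/* Restore read-only-execute; relies on set_memory_rox() internally. */
	return execmem_restore_rox(ptr, size);
}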
@@ -58,7 +59,7 @@ #define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV) #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ - FAN_MARK_FILESYSTEM) + FAN_MARK_FILESYSTEM | FAN_MARK_MNTNS) #define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \ FAN_MARK_FLUSH) @@ -109,10 +110,13 @@ /* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */ #define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR) +#define FANOTIFY_MOUNT_EVENTS (FAN_MNT_ATTACH | FAN_MNT_DETACH) + /* Events that user can request to be notified on */ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ FANOTIFY_INODE_EVENTS | \ - FANOTIFY_ERROR_EVENTS) + FANOTIFY_ERROR_EVENTS | \ + FANOTIFY_MOUNT_EVENTS) /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 9b3a8d9b17ab..7db62fbc0500 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) atomic_long_set(&ref->refcnt, cnt - 1); } +bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** @@ -161,6 +162,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) } /** + * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF + * @ref: Pointer to the reference count + * + * Semantically it is equivalent to calling file_ref_put(), but it trades lower + * performance in face of other CPUs also modifying the refcount for higher + * performance when this happens to be the last reference. + * + * For the last reference file_ref_put() issues 2 atomics. One to drop the + * reference and another to transition it to FILE_REF_DEAD. This routine does + * the work in one step, but in order to do it has to pre-read the variable which + * decreases scalability. + * + * Use with close() et al, stick to file_ref_put() by default. + */ +static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) +{ + long old, new; + + old = atomic_long_read(&ref->refcnt); + do { + if (unlikely(old < 0)) + return __file_ref_put_badval(ref, old); + + if (old == FILE_REF_ONEREF) + new = FILE_REF_DEAD; + else + new = old - 1; + } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); + + return new == FILE_REF_DEAD; +} + +/** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * @@ -174,4 +208,18 @@ static inline unsigned long file_ref_read(file_ref_t *ref) return c >= FILE_REF_RELEASED ? 0 : c + 1; } +/* + * __file_ref_read_raw - Return the value stored in ref->refcnt + * @ref: Pointer to the reference count + * + * Return: The raw value found in the counter + * + * A hack for file_needs_f_pos_lock(), you probably want to use + * file_ref_read() instead. 
+ */ +static inline unsigned long __file_ref_read_raw(file_ref_t *ref) +{ + return atomic_long_read(&ref->refcnt); +} + #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 2788df98080f..1a0e23a5d02d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> @@ -790,19 +791,8 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { - int testlen; - - /* - * TODO: patch it into a debug-only check if relevant macros show up. - * In the meantime, since we are suffering strlen even on production kernels - * to find the right length, do a fixup if the wrong value got passed. - */ - testlen = strlen(link); - if (testlen != linklen) { - WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", - link, linklen, testlen); - linklen = testlen; - } + VFS_WARN_ON_INODE(strlen(link) != linklen, inode); + VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; @@ -1067,7 +1057,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1077,12 +1066,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener + * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file - * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file @@ -1090,9 +1079,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) 
+ * @f_ref: reference count */ struct file { - file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1102,6 +1091,7 @@ struct file { unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; + struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { @@ -1115,7 +1105,6 @@ struct file { void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ - struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL @@ -1127,6 +1116,7 @@ struct file { struct file_ra_state f_ra; freeptr_t f_freeptr; }; + file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1981,8 +1971,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); -int vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, @@ -2039,7 +2029,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, @@ -2211,8 +2201,8 @@ struct inode_operations { int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); - int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); + struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); @@ -2616,6 +2606,7 @@ struct file_system_type { #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ +#define FS_LBS 128 /* FS supports LBS */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2653,9 +2644,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -2794,13 +2782,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) return mnt_idmap(mnt) != &nop_mnt_idmap; } -extern long vfs_truncate(const struct path *, loff_t); +int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern long do_sys_open(int dfd, const char __user *filename, int flags, - umode_t mode); +int do_sys_open(int dfd, const char __user *filename, int flags, + umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, @@ -2851,7 +2839,10 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); -extern struct filename *getname(const char __user *); +static inline struct filename *getname(const char __user *name) +{ + return getname_flags(name, 0); +} extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) @@ -2864,6 +2855,13 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f return __getname_maybe_null(name); } extern void putname(struct filename *name); +DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) + +static inline struct filename *refname(struct filename *name) +{ + atomic_inc(&name->refcnt); + return name; +} extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); @@ -3297,7 +3295,11 @@ static inline void __iget(struct inode *inode) extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *alloc_inode(struct super_block *sb); +static inline struct inode *new_inode_pseudo(struct super_block *sb) +{ + return alloc_inode(sb); +} extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 4b4bfef6f053..a19e4bd32e4d 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -144,8 +144,6 @@ extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); extern void fc_drop_locked(struct fs_context *fc); -int reconfigure_single(struct super_block *s, - int flags, void *data); extern int 
get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 18855cb44b1c..56fad33043d5 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -310,10 +310,8 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry, /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags); +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); @@ -480,10 +478,8 @@ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } -static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) +static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 83d3ac97f826..454d8e466958 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -320,6 +320,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt) __fsnotify_vfsmount_delete(mnt); } +static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + __fsnotify_mntns_delete(mntns); +} + /* * fsnotify_inoderemove - an inode is going away */ @@ -528,4 +533,19 @@ static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode, NULL, NULL, NULL, 0); } +static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_ATTACH, ns, mnt); +} + +static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_DETACH, ns, mnt); +} + +static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt) +{ + fsnotify_mnt(FS_MNT_MOVE, ns, mnt); +} + #endif /* _LINUX_FS_NOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 0d24a21a8e60..6cd8d1d28b8b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -59,6 +59,10 @@ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FS_MNT_DETACH 0x02000000 /* Mount was detached */ +#define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. 
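A hedged userspace sketch of how the mount-notification pieces above are expected to fit together: FAN_REPORT_MNT is requested at fanotify_init() time, a mark is placed on a mount namespace with FAN_MARK_MNTNS, and FAN_MNT_ATTACH/FAN_MNT_DETACH are the subscribed events that surface as FS_MNT_ATTACH/FS_MNT_DETACH on the fsnotify side. Using /proc/self/ns/mnt as the mount-namespace file descriptor is an assumption for illustration, not something this patch defines:

#include <fcntl.h>
#include <unistd.h>
#include <sys/fanotify.h>

/* Illustrative only: returns an fanotify fd reporting mount attach/detach. */
static int watch_mount_namespace(void)
{
	int fan_fd, mntns_fd;

	fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_MNT, 0);
	if (fan_fd < 0)
		return -1;

	/* Assumed: the mark object is the caller's own mount namespace. */
	mntns_fd = open("/proc/self/ns/mnt", O_RDONLY);
	if (mntns_fd < 0 ||
	    fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
			  FAN_MNT_ATTACH | FAN_MNT_DETACH, mntns_fd, NULL) < 0) {
		close(fan_fd);
		return -1;
	}

	return fan_fd;	/* read fanotify_event_metadata records from this fd */
}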
@@ -80,6 +84,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Mount namespace events */ +#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) + /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) @@ -108,6 +115,7 @@ /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ + FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ @@ -298,6 +306,7 @@ enum fsnotify_data_type { FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, + FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; @@ -318,6 +327,11 @@ static inline const struct path *file_range_path(const struct file_range *range) return range->path; } +struct fsnotify_mnt { + const struct mnt_namespace *ns; + u64 mnt_id; +}; + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -383,6 +397,24 @@ static inline struct super_block *fsnotify_data_sb(const void *data, } } +static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_MNT: + return data; + default: + return NULL; + } +} + +static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) +{ + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + + return mnt_data ? mnt_data->mnt_id : 0; +} + static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) @@ -420,6 +452,7 @@ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, + FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; @@ -429,6 +462,7 @@ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, + FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; @@ -613,8 +647,10 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); +extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); +extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { @@ -928,6 +964,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) static inline void fsnotify_sb_delete(struct super_block *sb) {} +static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{} + static inline void fsnotify_sb_free(struct super_block *sb) {} @@ -942,6 +981,9 @@ static inline u32 fsnotify_get_cookie(void) static inline void fsnotify_unmount_inodes(struct super_block *sb) {} +static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{} + #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f7bfdcf0dda3..88e078871158 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -223,6 +223,11 @@ static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) } #endif +static inline enum hrtimer_restart 
hrtimer_dummy_timeout(struct hrtimer *unused) +{ + return HRTIMER_NORESTART; +} + /* Exported timer functions: */ /* Initialize timers: */ @@ -333,6 +338,7 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) static inline void hrtimer_update_function(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *)) { +#ifdef CONFIG_PROVE_LOCKING guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock); if (WARN_ON_ONCE(hrtimer_is_queued(timer))) @@ -340,7 +346,7 @@ static inline void hrtimer_update_function(struct hrtimer *timer, if (WARN_ON_ONCE(!function)) return; - +#endif timer->function = function; } diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6..cd729be369b3 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -15,6 +15,7 @@ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> +#include <linux/cleanup.h> struct idr { struct radix_tree_root idr_rt; @@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); +struct __class_idr { + struct idr *idr; + int id; +}; + +#define idr_null ((struct __class_idr){ NULL, -1 }) +#define take_idr_id(id) __get_and_null(id, idr_null) + +DEFINE_CLASS(idr_alloc, struct __class_idr, + if (_T.id >= 0) idr_remove(_T.idr, _T.id), + ((struct __class_idr){ + .idr = idr, + .id = idr_alloc(idr, ptr, start, end, gfp), + }), + struct idr *idr, void *ptr, int start, int end, gfp_t gfp); + /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 8cd9327e4e78..c782a74d2a30 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -448,7 +448,7 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_disable(); #endif } @@ -456,22 +456,14 @@ static inline void disable_irq_nosync_lockdep(unsigned int irq) static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_save(*flags); #endif } -static inline void disable_irq_lockdep(unsigned int irq) -{ - disable_irq(irq); -#ifdef CONFIG_LOCKDEP - local_irq_disable(); -#endif -} - static inline void enable_irq_lockdep(unsigned int irq) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_enable(); #endif enable_irq(irq); @@ -479,7 +471,7 @@ static inline void enable_irq_lockdep(unsigned int irq) static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { -#ifdef CONFIG_LOCKDEP +#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT) local_irq_restore(*flags); #endif enable_irq(irq); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..02fe001feebb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,13 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. 
+ * + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic + * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +75,8 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) +#define IOMAP_F_ATOMIC_BIO (1U << 8) /* * Flags set by the core iomap code during operations: @@ -111,6 +120,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } @@ -182,7 +193,8 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* @@ -211,8 +223,10 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -221,7 +235,8 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; - s64 processed; + loff_t iter_start_pos; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; @@ -229,20 +244,46 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. 
+ */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); +} + +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); } /** @@ -306,12 +347,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - + const struct iomap_ops *ops, void *private); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, @@ -328,16 +368,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) + +/* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * (for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. + */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; @@ -362,12 +428,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted.
*/ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where @@ -383,6 +451,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -454,4 +526,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ diff --git a/include/linux/irq.h b/include/linux/irq.h index 8daa17f0107a..dd5df1e2d032 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI + * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { @@ -537,6 +538,8 @@ struct irq_chip { int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); + void (*irq_force_complete_move)(struct irq_data *data); + unsigned long flags; }; @@ -612,6 +615,7 @@ extern int irq_affinity_online_cpu(unsigned int cpu); #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) +bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { @@ -619,11 +623,10 @@ static inline void irq_move_irq(struct irq_data *data) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); -void irq_force_complete_move(struct irq_desc *desc); #else +static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } -static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; diff --git a/include/linux/irqchip/irq-davinci-cp-intc.h b/include/linux/irqchip/irq-davinci-cp-intc.h deleted file mode 100644 index 8d71ed5b5a61..000000000000 --- a/include/linux/irqchip/irq-davinci-cp-intc.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2019 Texas Instruments - */ - -#ifndef _LINUX_IRQ_DAVINCI_CP_INTC_ -#define _LINUX_IRQ_DAVINCI_CP_INTC_ - -#include <linux/ioport.h> - -/** - * struct davinci_cp_intc_config - configuration data for davinci-cp-intc - * driver. 
- * - * @reg: register range to map - * @num_irqs: number of HW interrupts supported by the controller - */ -struct davinci_cp_intc_config { - struct resource reg; - unsigned int num_irqs; -}; - -int davinci_cp_intc_init(const struct davinci_cp_intc_config *config); - -#endif /* _LINUX_IRQ_DAVINCI_CP_INTC_ */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e432b6a12a32..5126482515cb 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -281,6 +281,8 @@ static inline struct fwnode_handle *irq_domain_alloc_fwnode(phys_addr_t *pa) void irq_domain_free_fwnode(struct fwnode_handle *fwnode); +DEFINE_FREE(irq_domain_free_fwnode, struct fwnode_handle *, if (_T) irq_domain_free_fwnode(_T)) + struct irq_domain_chip_generic_info; /** @@ -350,13 +352,13 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); -extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, - enum irq_domain_bus_token bus_token); -extern void irq_set_default_host(struct irq_domain *host); -extern struct irq_domain *irq_get_default_host(void); -extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node, - const struct irq_affinity_desc *affinity); +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token); +void irq_set_default_host(struct irq_domain *host); +struct irq_domain *irq_get_default_host(void); +int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node, + const struct irq_affinity_desc *affinity); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { @@ -370,8 +372,8 @@ static inline bool is_fwnode_irqchip(const struct fwnode_handle *fwnode) return fwnode && fwnode->ops == &irqchip_fwnode_ops; } -extern void irq_domain_update_bus_token(struct irq_domain *domain, - enum irq_domain_bus_token bus_token); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token); static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, @@ -454,7 +456,7 @@ static inline struct irq_domain *irq_domain_add_nomap(struct device_node *of_nod return IS_ERR(d) ? NULL : d; } -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +unsigned int irq_create_direct_mapping(struct irq_domain *host); #endif static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, @@ -507,19 +509,19 @@ static inline struct irq_domain *irq_domain_create_tree(struct fwnode_handle *fw return IS_ERR(d) ? 
NULL : d; } -extern void irq_domain_remove(struct irq_domain *host); +void irq_domain_remove(struct irq_domain *host); -extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq); -extern void irq_domain_associate_many(struct irq_domain *domain, - unsigned int irq_base, - irq_hw_number_t hwirq_base, int count); +int irq_domain_associate(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq); +void irq_domain_associate_many(struct irq_domain *domain, + unsigned int irq_base, + irq_hw_number_t hwirq_base, int count); -extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity); -extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); -extern void irq_dispose_mapping(unsigned int virq); +unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); +void irq_dispose_mapping(unsigned int virq); static inline unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq) @@ -527,9 +529,9 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host, return irq_create_mapping_affinity(host, hwirq, NULL); } -extern struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq, - unsigned int *irq); +struct irq_desc *__irq_resolve_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq, + unsigned int *irq); static inline struct irq_desc *irq_resolve_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) @@ -587,19 +589,21 @@ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest); /* V2 interfaces to support hierarchy IRQ domains. 
*/ -extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, - unsigned int virq); -extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data, irq_flow_handler_t handler, - void *handler_data, const char *handler_name); -extern void irq_domain_reset_irq_data(struct irq_data *irq_data); +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq); +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, + const struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name); +void irq_domain_reset_irq_data(struct irq_data *irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -extern struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct fwnode_handle *fwnode, + const struct irq_domain_ops *ops, + void *host_data); static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, unsigned int flags, @@ -613,13 +617,13 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par ops, host_data); } -extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, - const struct irq_affinity_desc *affinity); -extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); -extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early); -extern void irq_domain_deactivate_irq(struct irq_data *irq_data); +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, + const struct irq_affinity_desc *affinity); +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs); +int irq_domain_activate_irq(struct irq_data *irq_data, bool early); +void irq_domain_deactivate_irq(struct irq_data *irq_data); static inline int irq_domain_alloc_irqs(struct irq_domain *domain, unsigned int nr_irqs, int node, void *arg) @@ -628,32 +632,29 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, NULL); } -extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); -extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, - unsigned int virq, - irq_hw_number_t hwirq, - const struct irq_chip *chip, - void *chip_data); -extern void irq_domain_free_irqs_common(struct irq_domain *domain, - unsigned int virq, - unsigned int nr_irqs); -extern void irq_domain_free_irqs_top(struct irq_domain *domain, - unsigned int virq, unsigned int nr_irqs); - -extern int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); -extern int irq_domain_pop_irq(struct irq_domain *domain, int virq); - -extern int irq_domain_alloc_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg); - -extern void irq_domain_free_irqs_parent(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs); - -extern int irq_domain_disconnect_hierarchy(struct irq_domain *domain, +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, + unsigned int virq, + irq_hw_number_t hwirq, + const struct 
irq_chip *chip, + void *chip_data); +void irq_domain_free_irqs_common(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs); +void irq_domain_free_irqs_top(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs); + +int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); +int irq_domain_pop_irq(struct irq_domain *domain, int virq); + +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg); + +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs); + +int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0e9f8eda7a3..c840431eadda 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes; #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME - /* * This structure is used to hold the arguments that are used when loading * kernel binaries. diff --git a/include/linux/key.h b/include/linux/key.h index 074dca3222b9..ba05de8579ec 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -236,6 +236,7 @@ struct key { #define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */ #define KEY_FLAG_KEEP 8 /* set if key should not be removed */ #define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */ +#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */ /* the key type and key description string * - the desc is used to match a key against search criteria diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h index 7fcf29a4e0de..6ea897222af1 100644 --- a/include/linux/kstrtox.h +++ b/include/linux/kstrtox.h @@ -143,6 +143,7 @@ static inline int __must_check kstrtos32_from_user(const char __user *s, size_t */ extern unsigned long simple_strtoul(const char *,char **,unsigned int); +extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t); extern long simple_strtol(const char *,char **,unsigned int); extern unsigned long long simple_strtoull(const char *,char **,unsigned int); extern long long simple_strtoll(const char *,char **,unsigned int); diff --git a/include/linux/libata.h b/include/linux/libata.h index c1c57f814b98..e5695998acb0 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -88,6 +88,7 @@ enum ata_quirks { __ATA_QUIRK_MAX_SEC_1024, /* Limit max sects to 1024 */ __ATA_QUIRK_MAX_TRIM_128M, /* Limit max trim size to 128M */ __ATA_QUIRK_NO_NCQ_ON_ATI, /* Disable NCQ on ATI chipset */ + __ATA_QUIRK_NO_LPM_ON_ATI, /* Disable LPM on ATI chipset */ __ATA_QUIRK_NO_ID_DEV_LOG, /* Identify device log missing */ __ATA_QUIRK_NO_LOG_DIR, /* Do not read log directory */ __ATA_QUIRK_NO_FUA, /* Do not use FUA */ @@ -432,6 +433,7 @@ enum { ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024), ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M), ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI), + ATA_QUIRK_NO_LPM_ON_ATI = (1U << __ATA_QUIRK_NO_LPM_ON_ATI), ATA_QUIRK_NO_ID_DEV_LOG = (1U << __ATA_QUIRK_NO_ID_DEV_LOG), ATA_QUIRK_NO_LOG_DIR = (1U << __ATA_QUIRK_NO_LOG_DIR), ATA_QUIRK_NO_FUA = (1U << __ATA_QUIRK_NO_FUA), diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index ae4526389261..07584c5e36fb 100644 --- 
a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -26,11 +26,34 @@ */ #define __sme_set(x) ((x) | sme_me_mask) #define __sme_clr(x) ((x) & ~sme_me_mask) + +#define dma_addr_encrypted(x) __sme_set(x) +#define dma_addr_canonical(x) __sme_clr(x) + #else #define __sme_set(x) (x) #define __sme_clr(x) (x) #endif +/* + * dma_addr_encrypted() and dma_addr_unencrypted() are for converting a given DMA + * address to the respective type of addressing. + * + * dma_addr_canonical() is used to reverse any conversions for encrypted/decrypted + * back to the canonical address. + */ +#ifndef dma_addr_encrypted +#define dma_addr_encrypted(x) (x) +#endif + +#ifndef dma_addr_unencrypted +#define dma_addr_unencrypted(x) (x) +#endif + +#ifndef dma_addr_canonical +#define dma_addr_canonical(x) (x) +#endif + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..4bf261d41a6d 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -60,7 +60,6 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -u64 misc_cg_res_total_usage(enum misc_res_type type); int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); @@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - return 0; -} - static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8483e09aeb2c..2edb8d14d165 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1458,7 +1458,10 @@ static inline void folio_get(struct folio *folio) static inline void get_page(struct page *page) { - folio_get(page_folio(page)); + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) @@ -1552,6 +1555,9 @@ static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); + if (folio_test_slab(folio)) + return; + /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: @@ -2549,7 +2555,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -struct page *get_dump_page(unsigned long addr); +struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b1b219bc3422..e71a6070a8f8 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -25,6 +25,11 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); +static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap) +{ + return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap; +} + #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index d67614f7b7f1..bd7e60c0b72f 100644 --- a/include/linux/mod_devicetable.h +++ 
b/include/linux/mod_devicetable.h @@ -692,6 +692,7 @@ struct x86_cpu_id { __u16 feature; /* bit index */ /* Solely for kernel-internal use: DO NOT EXPORT to userspace! */ __u16 flags; + __u8 type; kernel_ulong_t driver_data; }; @@ -703,6 +704,7 @@ struct x86_cpu_id { #define X86_STEP_MIN 0 #define X86_STEP_MAX 0xf #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ +#define X86_CPU_TYPE_ANY 0 /* * Generic table type for matching CPU features. diff --git a/include/linux/module.h b/include/linux/module.h index 30e5b19bafa9..9937e71a3b5b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -370,7 +370,6 @@ enum mod_mem_type { struct module_memory { void *base; - void *rw_copy; bool is_rox; unsigned int size; @@ -772,16 +771,6 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); -void *__module_writable_address(struct module *mod, void *loc); - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || - mod->state != MODULE_STATE_UNFORMED) - return loc; - return __module_writable_address(mod, loc); -} - #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) @@ -889,11 +878,6 @@ static inline bool module_is_coming(struct module *mod) { return false; } - -static inline void *module_writable_address(struct module *mod, void *loc) -{ - return loc; -} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 1f5507ba5a12..e395461d59e5 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,10 +108,6 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *mod); - #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else diff --git a/include/linux/msi.h b/include/linux/msi.h index b10093c4d00e..b81e10da62a2 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -81,7 +81,6 @@ struct device_attribute; struct irq_domain; struct irq_affinity_desc; -void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); #else @@ -225,8 +224,11 @@ struct msi_dev_domain { int msi_setup_device_data(struct device *dev); -void msi_lock_descs(struct device *dev); -void msi_unlock_descs(struct device *dev); +void __msi_lock_descs(struct device *dev); +void __msi_unlock_descs(struct device *dev); + +DEFINE_LOCK_GUARD_1(msi_descs_lock, struct device, __msi_lock_descs(_T->lock), + __msi_unlock_descs(_T->lock)); struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, enum msi_desc_filter filter); @@ -558,11 +560,21 @@ enum { MSI_FLAG_NO_AFFINITY = (1 << 21), }; +/* + * Flags for msi_parent_ops::chip_flags + */ +enum { + MSI_CHIP_FLAG_SET_EOI = (1 << 0), + MSI_CHIP_FLAG_SET_ACK = (1 << 1), +}; + /** * struct msi_parent_ops - MSI parent domain callbacks and configuration info * * @supported_flags: Required: The supported MSI flags of the parent domain * @required_flags: Optional: The required MSI flags of the parent MSI domain + * @chip_flags: Optional: Select MSI chip callbacks to update with defaults + * in msi_lib_init_dev_msi_info(). 
* @bus_select_token: Optional: The bus token of the real parent domain for * irq_domain::select() * @bus_select_mask: Optional: A mask of supported BUS_DOMAINs for @@ -575,6 +587,7 @@ enum { struct msi_parent_ops { u32 supported_flags; u32 required_flags; + u32 chip_flags; u32 bus_select_token; u32 bus_select_mask; const char *prefix; @@ -603,8 +616,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid); bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, enum irq_domain_bus_token bus_token); -int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs); @@ -613,8 +624,6 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *cookie); -void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last); void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last); void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid); diff --git a/include/linux/namei.h b/include/linux/namei.h index 8ec8fed3bce8..e3042176cdf4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -18,35 +18,36 @@ enum { MAX_NESTED_LINKS = 8 }; enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ -#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ -#define LOOKUP_DIRECTORY 0x0002 /* require a directory */ -#define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ -#define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ -#define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ -#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ - -#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ -#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ +#define LOOKUP_DIRECTORY BIT(1) /* require a directory */ +#define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ +#define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ +#define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ +#define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ +#define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ +#define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ +#define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ +#define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ +#define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ +/* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ -#define LOOKUP_OPEN 0x0100 /* ... in open */ -#define LOOKUP_CREATE 0x0200 /* ... in object creation */ -#define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */ -#define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */ +#define LOOKUP_OPEN BIT(16) /* ... in open */ +#define LOOKUP_CREATE BIT(17) /* ... in object creation */ +#define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ +#define LOOKUP_RENAME_TARGET BIT(19) /* ... 
in destination of rename() */ -/* internal use only */ -#define LOOKUP_PARENT 0x0010 +/* 4 spare bits for intent */ /* Scoping flags for lookup. */ -#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ -#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ -#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ -#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ -#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ -#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ -#define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */ +#define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ +#define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ +#define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */ +#define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ +#define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) +/* 3 spare bits for scoping */ extern int path_pts(struct path *path); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9155a6ffc370..d66c61cbbd1d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1802,7 +1802,7 @@ struct nfs_rpc_ops { int (*link) (struct inode *, struct inode *, const struct qstr *); int (*symlink) (struct inode *, struct dentry *, struct folio *, unsigned int, struct iattr *); - int (*mkdir) (struct inode *, struct dentry *, struct iattr *); + struct dentry *(*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a8dfb38c9bb6..e78fa535f61d 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,7 +17,6 @@ void lockup_detector_init(void); void lockup_detector_retry_init(void); void lockup_detector_soft_poweroff(void); -void lockup_detector_cleanup(void); extern int watchdog_user_enabled; extern int watchdog_thresh; @@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace; static inline void lockup_detector_init(void) { } static inline void lockup_detector_retry_init(void) { } static inline void lockup_detector_soft_poweroff(void) { } -static inline void lockup_detector_cleanup(void) { } #endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR @@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); -extern void hardlockup_detector_perf_cleanup(void); extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } -static inline void hardlockup_detector_perf_cleanup(void) { } static inline void hardlockup_config_perf_event(const char *str) { } #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 9fd7a0ce9c1a..f0ac0633366b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -94,7 +94,6 @@ #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> -#include <linux/numa.h> #include <linux/random.h> extern nodemask_t 
_unused_nodemask_arg_; @@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES) +static __always_inline void __nodes_copy(nodemask_t *dstp, + const nodemask_t *srcp, unsigned int nbits) +{ + bitmap_copy(dstp->bits, srcp->bits, nbits); +} + #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h index 6b28d97ea6ed..f850a48742f1 100644 --- a/include/linux/nodemask_types.h +++ b/include/linux/nodemask_types.h @@ -3,7 +3,16 @@ #define __LINUX_NODEMASK_TYPES_H #include <linux/bitops.h> -#include <linux/numa.h> + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; diff --git a/include/linux/numa.h b/include/linux/numa.h index 3567e40329eb..e6baaf6051bc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -3,16 +3,8 @@ #define _LINUX_NUMA_H #include <linux/init.h> #include <linux/types.h> +#include <linux/nodemask.h> -#ifdef CONFIG_NODES_SHIFT -#define NODES_SHIFT CONFIG_NODES_SHIFT -#else -#define NODES_SHIFT 0 -#endif - -#define MAX_NUMNODES (1 << NODES_SHIFT) - -#define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) static inline bool numa_valid_node(int nid) @@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid); /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); +int nearest_node_nodemask(int node, nodemask_t *mask); + #ifndef memory_add_physaddr_to_nid int memory_add_physaddr_to_nid(u64 start); #endif @@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state) return NUMA_NO_NODE; } +static inline int nearest_node_nodemask(int node, nodemask_t *mask) +{ + return NUMA_NO_NODE; +} + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; diff --git a/include/linux/objpool.h b/include/linux/objpool.h index cb1758eaa2d3..b713a1fe7521 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -170,17 +170,16 @@ static inline void *objpool_pop(struct objpool_head *pool) { void *obj = NULL; unsigned long flags; - int i, cpu; + int start, cpu; /* disable local irq to avoid preemption & interruption */ raw_local_irq_save(flags); - cpu = raw_smp_processor_id(); - for (i = 0; i < pool->nr_possible_cpus; i++) { + start = raw_smp_processor_id(); + for_each_possible_cpu_wrap(cpu, start) { obj = __objpool_try_get_slot(pool, cpu); if (obj) break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); } raw_local_irq_restore(flags); diff --git a/include/linux/objtool.h b/include/linux/objtool.h index c722a921165b..3ca965a2ddc8 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -128,7 +128,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) +#define __ASM_ANNOTATE(label, type) "" #define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 @@ -147,6 +147,8 @@ * these relocations will never be used for indirect calls. 
*/ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + /* * This should be used immediately before an indirect jump/call. It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..df9234e5f478 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -925,14 +925,15 @@ FOLIO_FLAG_FALSE(has_hwpoisoned) enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ - PGTY_buddy = 0xf0, - PGTY_offline = 0xf1, - PGTY_table = 0xf2, - PGTY_guard = 0xf3, - PGTY_hugetlb = 0xf4, - PGTY_slab = 0xf5, - PGTY_zsmalloc = 0xf6, - PGTY_unaccepted = 0xf7, + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; @@ -1075,6 +1076,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) +FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..d2432cb02fa0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio, return folio->index + folio_page_idx(folio, page); } -/* - * Return byte-offset into filesystem object for page. +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. */ -static inline loff_t page_offset(struct page *page) +static inline loff_t folio_pos(const struct folio *folio) { - return ((loff_t)page->index) << PAGE_SHIFT; + return ((loff_t)folio->index) * PAGE_SIZE; } -/** - * folio_pos - Returns the byte position of this folio in its file. - * @folio: The folio. +/* + * Return byte-offset into filesystem object for page. */ -static inline loff_t folio_pos(struct folio *folio) +static inline loff_t page_offset(struct page *page) { - return page_offset(&folio->page); + struct folio *folio = page_folio(page); + + return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* @@ -1603,34 +1605,6 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, } /** - * page_mkwrite_check_truncate - check if page was truncated - * @page: the page to check - * @inode: the inode to check the page against - * - * Returns the number of bytes in the page up to EOF, - * or -EFAULT if the page was truncated. - */ -static inline int page_mkwrite_check_truncate(struct page *page, - struct inode *inode) -{ - loff_t size = i_size_read(inode); - pgoff_t index = size >> PAGE_SHIFT; - int offset = offset_in_page(size); - - if (page->mapping != inode->i_mapping) - return -EFAULT; - - /* page is wholly inside EOF */ - if (page->index < index) - return PAGE_SIZE; - /* page is wholly past EOF */ - if (page->index > index || !offset) - return -EFAULT; - /* page is partially inside EOF */ - return offset; -} - -/** * i_blocks_per_folio - How many blocks fit in this folio. * @inode: The inode which contains the blocks. * @folio: The folio. 
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 5b520fe86b60..0fcacb909778 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -26,13 +26,11 @@ #define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" #endif -#define PER_CPU_FIRST_SECTION "..first" #else #define PER_CPU_SHARED_ALIGNED_SECTION "" #define PER_CPU_ALIGNED_SECTION "..shared_aligned" -#define PER_CPU_FIRST_SECTION "" #endif @@ -115,14 +113,17 @@ DEFINE_PER_CPU_SECTION(type, name, "") /* - * Declaration/definition used for per-CPU variables that must come first in - * the set of variables. + * Declaration/definition used for per-CPU variables that are frequently + * accessed and should be in a single cacheline. + * + * For use only by architecture and core code. Only use scalar or pointer + * types to maximize density. */ -#define DECLARE_PER_CPU_FIRST(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DECLARE_PER_CPU_CACHE_HOT(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..hot.." #name) -#define DEFINE_PER_CPU_FIRST(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION) +#define DEFINE_PER_CPU_CACHE_HOT(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..hot.." #name) /* * Declaration/definition used for per-CPU variables that must be cacheline diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index c012df33a9f0..af7d75ede619 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -8,6 +8,7 @@ #include <linux/wait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> +#include <linux/cleanup.h> struct percpu_rw_semaphore { struct rcu_sync rss; @@ -125,6 +126,13 @@ extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *, + percpu_down_read(_T), percpu_up_read(_T)) +DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T)) + +DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *, + percpu_down_write(_T), percpu_up_write(_T)) + static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) { return atomic_read(&sem->block); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 7ce6dea5bfa9..6dc5e0cd76ca 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -84,7 +84,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t supported_cpus; char *name; - int pmuver; irqreturn_t (*handle_irq)(struct arm_pmu *pmu); void (*enable)(struct perf_event *event); void (*disable)(struct perf_event *event); @@ -106,18 +105,20 @@ struct arm_pmu { int (*map_pmuv3_event)(unsigned int eventsel); DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ -#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 - DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); -#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 - DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); struct platform_device *plat_device; struct pmu_hw_events __percpu *hw_events; struct hlist_node node; struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; - /* store the PMMIR_EL1 to expose slots */ + + /* PMUv3 only */ + int pmuver; u64 reg_pmmir; +#define 
ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 + DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); +#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000 + DECLARE_BITMAP(pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); /* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8333f132f4a9..63dddb3b54f0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -343,8 +343,7 @@ struct pmu { */ unsigned int scope; - int __percpu *pmu_disable_count; - struct perf_cpu_pmu_context __percpu *cpu_pmu_context; + struct perf_cpu_pmu_context * __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; @@ -495,7 +494,7 @@ struct pmu { * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, - bool sched_in); + struct task_struct *task, bool sched_in); /* * Kmem cache of PMU specific data @@ -503,16 +502,6 @@ struct pmu { struct kmem_cache *task_ctx_cache; /* - * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) - * can be synchronized using this function. See Intel LBR callstack support - * implementation and Perf core context switch handling callbacks for usage - * examples. - */ - void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, - struct perf_event_pmu_context *next_epc); - /* optional */ - - /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, @@ -673,13 +662,16 @@ struct swevent_hlist { struct rcu_head rcu_head; }; -#define PERF_ATTACH_CONTEXT 0x01 -#define PERF_ATTACH_GROUP 0x02 -#define PERF_ATTACH_TASK 0x04 -#define PERF_ATTACH_TASK_DATA 0x08 -#define PERF_ATTACH_ITRACE 0x10 -#define PERF_ATTACH_SCHED_CB 0x20 -#define PERF_ATTACH_CHILD 0x40 +#define PERF_ATTACH_CONTEXT 0x0001 +#define PERF_ATTACH_GROUP 0x0002 +#define PERF_ATTACH_TASK 0x0004 +#define PERF_ATTACH_TASK_DATA 0x0008 +#define PERF_ATTACH_GLOBAL_DATA 0x0010 +#define PERF_ATTACH_SCHED_CB 0x0020 +#define PERF_ATTACH_CHILD 0x0040 +#define PERF_ATTACH_EXCLUSIVE 0x0080 +#define PERF_ATTACH_CALLCHAIN 0x0100 +#define PERF_ATTACH_ITRACE 0x0200 struct bpf_prog; struct perf_cgroup; @@ -921,7 +913,7 @@ struct perf_event_pmu_context { struct list_head pinned_active; struct list_head flexible_active; - /* Used to avoid freeing per-cpu perf_event_pmu_context */ + /* Used to identify the per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; @@ -931,7 +923,6 @@ struct perf_event_pmu_context { atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; - void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to @@ -979,7 +970,6 @@ struct perf_event_context { int nr_user; int is_active; - int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; @@ -1020,6 +1010,41 @@ struct perf_event_context { local_t nr_no_switch_fast; }; +/** + * struct perf_ctx_data - PMU specific data for a task + * @rcu_head: To avoid the race on free PMU specific data + * @refcount: To track users + * @global: To track system-wide users + * @ctx_cache: Kmem cache of PMU specific data + * @data: PMU specific data + * + * Currently, the struct is only used in Intel LBR call stack mode to + * save/restore the call stack of a task on context switches. + * + * The rcu_head is used to prevent the race on free the data. 
+ * The data only be allocated when Intel LBR call stack mode is enabled. + * The data will be freed when the mode is disabled. + * The content of the data will only be accessed in context switch, which + * should be protected by rcu_read_lock(). + * + * Because of the alignment requirement of Intel Arch LBR, the Kmem cache + * is used to allocate the PMU specific data. The ctx_cache is to track + * the Kmem cache. + * + * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct. + * When system-wide Intel LBR call stack mode is enabled, a buffer with + * constant size will be allocated for each task. + * Also, system memory consumption can further grow when the size of + * struct perf_ctx_data enlarges. + */ +struct perf_ctx_data { + struct rcu_head rcu_head; + refcount_t refcount; + int global; + struct kmem_cache *ctx_cache; + void *data; +}; + struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; @@ -1029,6 +1054,7 @@ struct perf_cpu_pmu_context { int active_oncpu; int exclusive; + int pmu_disable_count; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; @@ -1062,7 +1088,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; @@ -1339,6 +1371,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, if (branch_sample_hw_index(event)) size += sizeof(u64); + + brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); + size += brs->nr * sizeof(struct perf_branch_entry); /* @@ -1646,19 +1681,10 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 } extern int sysctl_perf_event_paranoid; -extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; -extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); - /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/pid.h b/include/linux/pid.h index 98837a1ff0f3..311ecebd7d56 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); * these helpers must be called with the tasklist_lock write-held. 
*/ extern void attach_pid(struct task_struct *task, enum pid_type); -extern void detach_pid(struct task_struct *task, enum pid_type); -extern void change_pid(struct task_struct *task, enum pid_type, - struct pid *pid); +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type); +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type, + struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); @@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); +void free_pids(struct pid **pids); extern void disable_pid_allocation(struct pid_namespace *ns); /* diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 7c830d0dec9a..05e6f8f4a026 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +void pidfs_exit(struct task_struct *tsk); extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b698758000f8..9d42d473d201 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -108,7 +108,7 @@ struct pipe_inode_info { #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif - struct page *tmp_page; + struct page *tmp_page[2]; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index f11f10c97bd9..dd48c64b605e 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q); void posixtimer_send_sigqueue(struct k_itimer *tmr); bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq); void posixtimer_free_timer(struct k_itimer *timer); +long posixtimer_create_prctl(unsigned long ctrl); /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ @@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { } static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq) { return false; } static inline void posixtimer_free_timer(struct k_itimer *timer) { } +static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK @@ -177,23 +179,26 @@ static inline void posix_cputimers_init_work(void) { } * @rcu: RCU head for freeing the timer. 
*/ struct k_itimer { - struct hlist_node list; - struct hlist_node ignored_list; + /* 1st cacheline contains read-mostly fields */ struct hlist_node t_hash; - spinlock_t it_lock; - const struct k_clock *kclock; - clockid_t it_clock; + struct hlist_node list; timer_t it_id; + clockid_t it_clock; + int it_sigev_notify; + enum pid_type it_pid_type; + struct signal_struct *it_signal; + const struct k_clock *kclock; + + /* 2nd cacheline and above contain fields which are modified regularly */ + spinlock_t it_lock; int it_status; bool it_sig_periodic; s64 it_overrun; s64 it_overrun_last; unsigned int it_signal_seq; unsigned int it_sigqueue_seq; - int it_sigev_notify; - enum pid_type it_pid_type; ktime_t it_interval; - struct signal_struct *it_signal; + struct hlist_node ignored_list; union { struct pid *it_pid; struct task_struct *it_process; @@ -210,7 +215,7 @@ struct k_itimer { } alarm; } it; struct rcu_head rcu; -}; +} ____cacheline_aligned_in_smp; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); @@ -240,6 +245,13 @@ static inline void posixtimer_sigqueue_putref(struct sigqueue *q) posixtimer_putref(tmr); } + +static inline bool posixtimer_valid(const struct k_itimer *timer) +{ + unsigned long val = (unsigned long)timer->it_signal; + + return !(val & 0x1UL); +} #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { } static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { } diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ca86235ac15c..b0af8d4ef6e6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -319,6 +319,7 @@ do { \ #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; +struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled @@ -515,6 +516,8 @@ static inline bool preempt_model_rt(void) return IS_ENABLED(CONFIG_PREEMPT_RT); } +extern const char *preempt_model_str(void); + /* * Does the preemption model allow non-cooperative preemption? * diff --git a/include/linux/printk.h b/include/linux/printk.h index 4217a9f412b2..5b462029d03c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); +bool pr_flush(int timeout_ms, bool reset_on_progress); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void) { } +static inline bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + return true; +} + #endif bool this_cpu_in_panic(void); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0b2a89854440..ea62201c74c4 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -20,10 +20,13 @@ enum { * If in doubt, ignore this flag. 
*/ #ifdef MODULE - PROC_ENTRY_PERMANENT = 0U, + PROC_ENTRY_PERMANENT = 0U, #else - PROC_ENTRY_PERMANENT = 1U << 0, + PROC_ENTRY_PERMANENT = 1U << 0, #endif + + PROC_ENTRY_proc_read_iter = 1U << 1, + PROC_ENTRY_proc_compat_ioctl = 1U << 2, }; struct proc_ops { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..f8159f8a7d73 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) @@ -121,12 +121,6 @@ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -#ifdef CONFIG_TASKS_RCU_GENERIC -void rcu_init_tasks_generic(void); -#else -static inline void rcu_init_tasks_generic(void) { } -#endif - #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); @@ -806,11 +800,9 @@ do { \ * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * - * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also - * wait for regions of code with preemption disabled, including regions of - * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which - * define synchronize_sched(), only code enclosed within rcu_read_lock() - * and rcu_read_unlock() are guaranteed to be waited for. + * Both synchronize_rcu() and call_rcu() also wait for regions of code + * with preemption disabled, including regions of code with interrupts or + * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen @@ -865,11 +857,10 @@ static __always_inline void rcu_read_lock(void) * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. - * In recent kernels that have consolidated synchronize_sched() and - * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity - * also extends to the scheduler's runqueue and priority-inheritance - * spinlocks, courtesy of the quiescent-state deferral that is carried - * out when rcu_read_unlock() is invoked with interrupts disabled. + * This deadlock immunity also extends to the scheduler's runqueue + * and priority-inheritance spinlocks, courtesy of the quiescent-state + * deferral that is carried out when rcu_read_unlock() is invoked with + * interrupts disabled. * * See rcu_read_lock() for more information. */ @@ -1025,12 +1016,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. @@ -1041,11 +1026,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. 
Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1082,14 +1067,23 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. + */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index f9bed3d3f78d..4c92d4291cce 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -16,6 +16,9 @@ struct rcu_synchronize { struct rcu_head head; struct completion completion; + + /* This is for debugging. */ + struct rcu_gp_oldstate oldstate; }; void wakeme_after_rcu(struct rcu_head *head); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667f..f519cd680228 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. It is - * not so straight forward to just include <linux/mm.h> - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. 
- might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..9d2d7bd251d4 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; @@ -103,7 +100,7 @@ extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPTION +#ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index d94abba1c716..880351ca3dfc 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/list.h> #include <linux/pid.h> +#include <linux/resctrl_types.h> /* CLOSID, RMID value used by the default control group */ #define RESCTRL_RESERVED_CLOSID 0 @@ -25,6 +26,24 @@ int proc_resctrl_show(struct seq_file *m, /* max value for struct rdt_domain's mbps_val */ #define MBA_MAX_MBPS U32_MAX +/* Walk all possible resources, with variants for only controls or monitors. */ +#define for_each_rdt_resource(_r) \ + for ((_r) = resctrl_arch_get_resource(0); \ + (_r) && (_r)->rid < RDT_NUM_RESOURCES; \ + (_r) = resctrl_arch_get_resource((_r)->rid + 1)) + +#define for_each_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable || (r)->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for_each_rdt_resource((r)) \ + if ((r)->mon_capable) + /** * enum resctrl_conf_type - The type of configuration. * @CDP_NONE: No prioritisation, both code and data are controlled or monitored. @@ -40,13 +59,42 @@ enum resctrl_conf_type { #define CDP_NUM_TYPES (CDP_DATA + 1) /* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. 
+ * struct pseudo_lock_region - pseudo-lock region information + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs + * @closid: The closid that this pseudo-locked region uses + * @d: RDT domain to which this pseudo-locked region + * belongs + * @cbm: bitmask of the pseudo-locked region + * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread + * completion + * @thread_done: variable used by waitqueue to test if pseudo-locking + * thread completed + * @cpu: core associated with the cache on which the setup code + * will be run + * @line_size: size of the cache lines + * @size: size of pseudo-locked region in bytes + * @kmem: the kernel memory associated with pseudo-locked region + * @minor: minor number of character device associated with this + * region + * @debugfs_dir: pointer to this region's directory in the debugfs + * filesystem + * @pm_reqs: Power management QoS requests related to this region */ -enum resctrl_event_id { - QOS_L3_OCCUP_EVENT_ID = 0x01, - QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, - QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +struct pseudo_lock_region { + struct resctrl_schema *s; + u32 closid; + struct rdt_ctrl_domain *d; + u32 cbm; + wait_queue_head_t lock_thread_wq; + int thread_done; + int cpu; + unsigned int line_size; + unsigned int size; + void *kmem; + unsigned int minor; + struct dentry *debugfs_dir; + struct list_head pm_reqs; }; /** @@ -155,6 +203,7 @@ enum membw_throttle_mode { /** * struct resctrl_membw - Memory bandwidth allocation related data * @min_bw: Minimum memory bandwidth percentage user can request + * @max_bw: Maximum memory bandwidth value, used as the reset value * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale * @arch_needs_linear: True if we can't configure non-linear resources @@ -165,6 +214,7 @@ enum membw_throttle_mode { */ struct resctrl_membw { u32 min_bw; + u32 max_bw; u32 bw_gran; u32 delay_linear; bool arch_needs_linear; @@ -173,7 +223,6 @@ struct resctrl_membw { u32 *mb_map; }; -struct rdt_parse_data; struct resctrl_schema; enum resctrl_scope { @@ -183,6 +232,16 @@ enum resctrl_scope { }; /** + * enum resctrl_schema_fmt - The format user-space provides for a schema. + * @RESCTRL_SCHEMA_BITMAP: The schema is a bitmap in hex. + * @RESCTRL_SCHEMA_RANGE: The schema is a decimal number. + */ +enum resctrl_schema_fmt { + RESCTRL_SCHEMA_BITMAP, + RESCTRL_SCHEMA_RANGE, +}; + +/** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine @@ -195,12 +254,10 @@ enum resctrl_scope { * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. - * @data_width: Character width of data when displaying - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values + * @schema_fmt: Which format string and parser is used for this schema. * @evt_list: List of monitoring events - * @fflags: flags to choose base and info files + * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth + * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { @@ -215,22 +272,25 @@ struct rdt_resource { struct list_head ctrl_domains; struct list_head mon_domains; char *name; - int data_width; - u32 default_ctrl; - const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); + enum resctrl_schema_fmt schema_fmt; struct list_head evt_list; - unsigned long fflags; + unsigned int mbm_cfg_mask; bool cdp_capable; }; +/* + * Get the resource that exists at this level. If the level is not supported + * a dummy/not-capable resource can be returned. Levels >= RDT_NUM_RESOURCES + * will return NULL. + */ +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l); + /** * struct resctrl_schema - configuration abilities of a resource presented to * user-space * @list: Member of resctrl_schema_all. * @name: The name to use in the "schemata" file. + * @fmt_str: Format string to show domain value. * @conf_type: Whether this schema is specific to code/data. * @res: The resource structure exported by the architecture to describe * the hardware that is configured by this schema. @@ -241,16 +301,104 @@ struct rdt_resource { struct resctrl_schema { struct list_head list; char name[8]; + const char *fmt_str; enum resctrl_conf_type conf_type; struct rdt_resource *res; u32 num_closid; }; +struct resctrl_cpu_defaults { + u32 closid; + u32 rmid; +}; + +struct resctrl_mon_config_info { + struct rdt_resource *r; + struct rdt_mon_domain *d; + u32 evtid; + u32 mon_config; +}; + +/** + * resctrl_arch_sync_cpu_closid_rmid() - Refresh this CPU's CLOSID and RMID. + * Call via IPI. + * @info: If non-NULL, a pointer to a struct resctrl_cpu_defaults + * specifying the new CLOSID and RMID for tasks in the default + * resctrl ctrl and mon group when running on this CPU. If NULL, + * this CPU is not re-assigned to a different default group. + * + * Propagates reassignment of CPUs and/or tasks to different resctrl groups + * when requested by the resctrl core code. + * + * This function records the per-cpu defaults specified by @info (if any), + * and then reconfigures the CPU's hardware CLOSID and RMID for subsequent + * execution based on @current, in the same way as during a task switch. + */ +void resctrl_arch_sync_cpu_closid_rmid(void *info); + +/** + * resctrl_get_default_ctrl() - Return the default control value for this + * resource. + * @r: The resource whose default control type is queried. + */ +static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) +{ + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + return BIT_MASK(r->cache.cbm_len) - 1; + case RESCTRL_SCHEMA_RANGE: + return r->membw.max_bw; + } + + return WARN_ON_ONCE(1); +} + /* The number of closid supported by this resource regardless of CDP */ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); + +/** + * resctrl_arch_mon_event_config_write() - Write the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and writes the + * event config_info->mon_config into hardware. + * + * Called via IPI to reach a CPU that is a member of the specified domain. 
+ */ +void resctrl_arch_mon_event_config_write(void *config_info); + +/** + * resctrl_arch_mon_event_config_read() - Read the config for an event. + * @config_info: struct resctrl_mon_config_info describing the resource, domain + * and event. + * + * Reads resource, domain and eventid from @config_info and reads the + * hardware config value into config_info->mon_config. + * + * Called via IPI to reach a CPU that is a member of the specified domain. + */ +void resctrl_arch_mon_event_config_read(void *config_info); + +/* For use by arch code to remap resctrl's smaller CDP CLOSID range */ +static inline u32 resctrl_get_config_index(u32 closid, + enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. @@ -314,6 +462,20 @@ static inline void resctrl_arch_rmid_read_context_check(void) } /** + * resctrl_find_domain() - Search for a domain id in a resource domain list. + * @h: The domain list to search. + * @id: The domain id to search for. + * @pos: A pointer to position in the list id should be inserted. + * + * Search the domain list to find the domain id. If the domain id is + * found, return the domain. NULL otherwise. If the domain id is not + * found (and NULL returned) then the first domain with id bigger than + * the input id can be returned to the caller via @pos. + */ +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos); + +/** * resctrl_arch_reset_rmid() - Reset any private state associated with rmid * and eventid. * @r: The domain's resource. @@ -340,7 +502,19 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, */ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +/** + * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its + * default. + * @r: The resctrl resource to reset. + * + * This can be called from any CPU. + */ +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; +int __init resctrl_init(void); +void __exit resctrl_exit(void); + #endif /* _RESCTRL_H */ diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h new file mode 100644 index 000000000000..f26450b3326b --- /dev/null +++ b/include/linux/resctrl_types.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Arm Ltd. 
+ * Based on arch/x86/kernel/cpu/resctrl/internal.h + */ + +#ifndef __LINUX_RESCTRL_TYPES_H +#define __LINUX_RESCTRL_TYPES_H + +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) + +enum resctrl_res_level { + RDT_RESOURCE_L3, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +/* + * Event IDs, the values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ +enum resctrl_event_id { + QOS_L3_OCCUP_EVENT_ID = 0x01, + QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, + QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, +}; + +#endif /* __LINUX_RESCTRL_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c15365a30c0..6e5c38718ff5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -65,6 +65,7 @@ struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; @@ -382,6 +383,11 @@ enum uclamp_id { #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); +#else +static inline void sched_domains_mutex_lock(void) { } +static inline void sched_domains_mutex_unlock(void) { } #endif struct sched_param { @@ -1311,6 +1317,7 @@ struct task_struct { struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 3a912ab42bb5..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -34,7 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + #endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index b5035afa2396..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. 
*/ #define __sched __section(".sched.text") diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1d70a9867fb1..f7545430a548 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -146,6 +146,7 @@ struct sched_ext_entity { u32 weight; s32 sticky_cpu; s32 holding_cpu; + s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index e670ac282333..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 928a626725e6..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -531,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. 
+ */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d5d03d919df8..1ef1edbaaf79 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -136,7 +136,8 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - unsigned int next_posix_timer_id; + unsigned int timer_create_restore_ids:1; + atomic_t next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7f3dbafe1817..7b4301b7235f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,16 +25,12 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { @@ -166,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains_locked(int ndoms_new, - cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -211,12 +203,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl); struct sched_domain_attr; static inline void -partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index e45531455d3b..9b959972bf4a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -22,21 +22,17 @@ #include <linux/atomic.h> #include <asm/seccomp.h> +extern int __secure_computing(void); + #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { if (unlikely(test_syscall_work(SECCOMP))) - return __secure_computing(NULL); + return __secure_computing(); return 0; } #else extern void secure_computing_strict(int this_syscall); -static inline int __secure_computing(const struct seccomp_data *sd) -{ - secure_computing_strict(sd->nr); - return 0; -} #endif extern long prctl_get_seccomp(void); @@ -58,7 +54,7 @@ static inline int secure_computing(void) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } +static inline int __secure_computing(void) { return 0; } static inline long prctl_get_seccomp(void) { diff --git a/include/linux/sizes.h b/include/linux/sizes.h index c3a00b967d18..49039494076f 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -23,17 +23,25 @@ #define SZ_4K 0x00001000 #define SZ_8K 0x00002000 #define SZ_16K 0x00004000 +#define SZ_24K 0x00006000 #define SZ_32K 0x00008000 #define SZ_64K 0x00010000 #define SZ_128K 0x00020000 +#define SZ_192K 0x00030000 #define SZ_256K 0x00040000 +#define SZ_384K 0x00060000 #define SZ_512K 0x00080000 #define SZ_1M 0x00100000 #define SZ_2M 0x00200000 +#define SZ_3M 0x00300000 #define SZ_4M 0x00400000 +#define SZ_6M 0x00600000 #define SZ_8M 
0x00800000 +#define SZ_12M 0x00c00000 #define SZ_16M 0x01000000 +#define SZ_18M 0x01200000 +#define SZ_24M 0x01800000 #define SZ_32M 0x02000000 #define SZ_64M 0x04000000 #define SZ_128M 0x08000000 diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..98e07e9e9e58 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/types.h> +#include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/percpu-refcount.h> #include <linux/cleanup.h> @@ -941,8 +942,6 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc_noprof(bytes, flags); return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) @@ -1082,6 +1081,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifndef CONFIG_KVFREE_RCU_BATCHED +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * diff --git a/include/linux/srcu.h b/include/linux/srcu.h index d7ba46e74f58..900b0d5c05f5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -47,7 +47,13 @@ int init_srcu_struct(struct srcu_struct *ssp); #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). -#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. +#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) + // Flavors requiring synchronize_rcu() + // instead of smp_mb(). +void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> @@ -60,15 +66,6 @@ int init_srcu_struct(struct srcu_struct *ssp); void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); -int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); -#ifdef CONFIG_TINY_SRCU -#define __srcu_read_lock_lite __srcu_read_lock -#define __srcu_read_unlock_lite __srcu_read_unlock -#else // #ifdef CONFIG_TINY_SRCU -int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); -void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); -#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -258,6 +255,51 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) } /** + * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. 
+ * + * If srcu_read_lock_fast() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _fast + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_fast() can be invoked only from those contexts + * where RCU is watching, that is, from contexts where it would be legal + * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + +/** + * srcu_down_read_fast - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter a semaphore-like SRCU read-side critical section, but for + * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and + * srcu_down_read() for more information. + * + * The same srcu_struct may be used concurrently by srcu_down_read_fast() + * and srcu_read_lock_fast(). + */ +static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + return __srcu_read_lock_fast(ssp); +} + +/** * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * @@ -278,7 +320,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor_lite(ssp); + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE); retval = __srcu_read_lock_lite(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -335,7 +377,8 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in - * which calls to down_read() may be nested. + * which calls to down_read() may be nested. The same srcu_struct may be + * used concurrently by srcu_down_read() and srcu_read_lock(). */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { @@ -361,9 +404,40 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) } /** + * srcu_read_unlock_fast - unregister an old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast(ssp, scp); +} + +/** + * srcu_up_read_fast - unregister an old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast(). + * + * Exit an SRCU read-side critical section, but not necessarily from + * the same context as the matching srcu_down_read_fast(). 
+ */ +static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) + __releases(ssp) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + +/** * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_lite(). * * Exit a light-weight SRCU read-side critical section. */ @@ -379,7 +453,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). + * @idx: return value from corresponding srcu_read_lock_nmisafe(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1321da803274..380260317d98 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -64,13 +64,38 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); preempt_enable(); return idx; } +struct srcu_ctr; + +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return (int)(intptr_t)(struct srcu_ctr __force __kernel *)scpp; +} + +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return (struct srcu_ctr __percpu *)(intptr_t)idx; +} + +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); @@ -82,7 +107,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_lite(ssp) do { } while (0) +#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index b17814c9d1c7..8bed7e6cc4c1 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -17,14 +17,19 @@ struct srcu_node; struct srcu_struct; +/* One element of the srcu_data srcu_ctrs array. */ +struct srcu_ctr { + atomic_long_t srcu_locks; /* Locks per CPU. */ + atomic_long_t srcu_unlocks; /* Unlocks per CPU. */ +}; + /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ - atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ - atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. 
*/ + struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Values: SRCU_READ_FLAVOR_.* */ @@ -95,7 +100,7 @@ struct srcu_usage { * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { - unsigned int srcu_idx; /* Current rdr array element. */ + struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ @@ -162,6 +167,7 @@ struct srcu_struct { #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ + .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } @@ -201,10 +207,77 @@ struct srcu_struct { #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); +// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that +// element's index. +static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp) +{ + return scpp - &ssp->sda->srcu_ctrs[0]; +} + +// Converts an integer to a per-CPU pointer to the corresponding +// ->srcu_ctrs[] array element. +static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx) +{ + return &ssp->sda->srcu_ctrs[idx]; +} + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast(). + * + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_lock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). + */ +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +{ + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); /* Y */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ + barrier(); /* Avoid leaking the critical section. */ + return scp; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side + * critical sections either because they disables interrupts, because they + * are a single instruction, or because they are a read-modify-write atomic + * operation, depending on the whims of the architecture. + * + * This means that __srcu_read_unlock_fast() is not all that fast + * on architectures that support NMIs but do not supply NMI-safe + * implementations of this_cpu_inc(). 
+ */ +static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); +} + /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching @@ -217,13 +290,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } /* @@ -240,22 +312,24 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. -static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) +// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is +// needed only for flavors that require grace-period smp_mb() calls to be +// promoted to synchronize_rcu(). +static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) { struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) + if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) return; // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + __srcu_check_read_flavor(ssp, read_flavor); } // Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. 
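The srcu.h and srcutree.h hunks above introduce the _fast SRCU reader flavor, whose lock primitive returns a per-CPU struct srcu_ctr pointer instead of an integer index. A minimal reader-side sketch, assuming a statically defined srcu_struct; the names my_srcu and my_reader() are illustrative only and not part of the patch:

DEFINE_STATIC_SRCU(my_srcu);

static void my_reader(void)
{
	struct srcu_ctr __percpu *scp;

	/* Returns a per-CPU counter pointer rather than an array index. */
	scp = srcu_read_lock_fast(&my_srcu);
	/* ... access data published for this srcu_struct, e.g. via rcu_dereference() ... */
	srcu_read_unlock_fast(&my_srcu, scp);	/* pass the same pointer back */
}

Per the kernel-doc above, once a given srcu_struct has been used with the _fast flavor, no other reader flavor may be mixed in on that structure, and _fast readers may run only in contexts where RCU is watching.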
diff --git a/include/linux/string.h b/include/linux/string.h index f8e21e80942f..0403a4ca4c11 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -415,8 +415,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem_pad(dest, src, pad) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -439,8 +441,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define strtomem(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_noncstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_cstr(src) + \ + __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ _dest_len == (size_t)-1); \ @@ -459,8 +463,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ @@ -485,8 +491,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, */ #define memtostr_pad(dest, src) do { \ const size_t _dest_len = __must_be_byte_array(dest) + \ + __must_be_cstr(dest) + \ ARRAY_SIZE(dest); \ - const size_t _src_len = __builtin_object_size(src, 1); \ + const size_t _src_len = __must_be_noncstr(src) + \ + __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ \ diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 120ca0f28e95..f3ba4f52ff26 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,23 +41,23 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) -static inline const char *str_read_write(bool v) -{ - return v ? "read" : "write"; -} -#define str_write_read(v) str_read_write(!(v)) - static inline const char *str_on_off(bool v) { return v ? "on" : "off"; } #define str_off_on(v) str_on_off(!(v)) -static inline const char *str_yes_no(bool v) +static inline const char *str_read_write(bool v) { - return v ? "yes" : "no"; + return v ? "read" : "write"; } -#define str_no_yes(v) str_yes_no(!(v)) +#define str_write_read(v) str_read_write(!(v)) + +static inline const char *str_true_false(bool v) +{ + return v ? "true" : "false"; +} +#define str_false_true(v) str_true_false(!(v)) static inline const char *str_up_down(bool v) { @@ -65,11 +65,11 @@ static inline const char *str_up_down(bool v) } #define str_down_up(v) str_up_down(!(v)) -static inline const char *str_true_false(bool v) +static inline const char *str_yes_no(bool v) { - return v ? "true" : "false"; + return v ? 
"yes" : "no"; } -#define str_false_true(v) str_true_false(!(v)) +#define str_no_yes(v) str_yes_no(!(v)) /** * str_plural - Return the simple pluralization based on English counts diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index b5ec038069da..91cdf12190a0 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,7 +6,7 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent); extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); @@ -15,7 +15,7 @@ extern void swap_cgroup_swapoff(int type); #else static inline -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent) { } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d451..e5603cc91963 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -951,6 +951,10 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, int flags, uint32_t sig); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); +asmlinkage long sys_open_tree_attr(int dfd, const char __user *path, + unsigned flags, + struct mount_attr __user *uattr, + size_t usize); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); @@ -1266,14 +1270,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, AT_SYMLINK_NOFOLLOW); } -extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } -extern long do_sys_truncate(const char __user *pathname, loff_t length); +int do_sys_truncate(const char __user *pathname, loff_t length); static inline long ksys_truncate(const char __user *pathname, loff_t length) { diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h deleted file mode 100644 index 5cf77dbb8d86..000000000000 --- a/include/linux/sysv_fs.h +++ /dev/null @@ -1,214 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SYSV_FS_H -#define _LINUX_SYSV_FS_H - -#define __packed2__ __attribute__((packed, aligned(2))) - - -#ifndef __KERNEL__ -typedef u16 __fs16; -typedef u32 __fs16; -#endif - -/* inode numbers are 16 bit */ -typedef __fs16 sysv_ino_t; - -/* Block numbers are 24 bit, sometimes stored in 32 bit. - On Coherent FS, they are always stored in PDP-11 manner: the least - significant 16 bits come last. 
*/ -typedef __fs32 sysv_zone_t; - -/* 0 is non-existent */ -#define SYSV_BADBL_INO 1 /* inode of bad blocks file */ -#define SYSV_ROOT_INO 2 /* inode of root directory */ - - -/* Xenix super-block data on disk */ -#define XENIX_NICINOD 100 /* number of inode cache entries */ -#define XENIX_NICFREE 100 /* number of free block list chunk entries */ -struct xenix_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= XENIX_NICFREE */ - sysv_zone_t s_free[XENIX_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= XENIX_NICINOD */ - sysv_ino_t s_inode[XENIX_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_dinfo[4]; /* device information ?? */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - char s_clean; /* set to 0x46 when filesystem is properly unmounted */ - char s_fill[371]; - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks - 3 for 2048 byte blocks */ - -}; - -/* - * SystemV FS comes in two variants: - * sysv2: System V Release 2 (e.g. Microport), structure elements aligned(2). - * sysv4: System V Release 4 (e.g. Consensys), structure elements aligned(4). - */ -#define SYSV_NICINOD 100 /* number of inode cache entries */ -#define SYSV_NICFREE 50 /* number of free block list chunk entries */ - -/* SystemV4 super-block data on disk */ -struct sysv4_super_block { - __fs16 s_isize; /* index of first data zone */ - u16 s_pad0; - __fs32 s_fsize; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - u16 s_pad1; - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - u16 s_pad2; - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? 
*/ - __fs32 s_tfree; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - u16 s_pad3; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[12]; - __fs32 s_state; /* file system state: 0x7c269d38-s_time means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* SystemV2 super-block data on disk */ -struct sysv2_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */ - sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */ - sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs16 s_dinfo[4]; /* device information ?? */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - s32 s_fill[14]; - __fs32 s_state; /* file system state: 0xcb096f43 means clean */ - s32 s_magic; /* version of file system */ - __fs32 s_type; /* type of file system: 1 for 512 byte blocks - 2 for 1024 byte blocks */ -}; - -/* V7 super-block data on disk */ -#define V7_NICINOD 100 /* number of inode cache entries */ -#define V7_NICFREE 50 /* number of free block list chunk entries */ -struct v7_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= V7_NICFREE */ - sysv_zone_t s_free[V7_NICFREE]; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= V7_NICINOD */ - sysv_ino_t s_inode[V7_NICINOD]; /* some free inodes */ - /* locks, not used by Linux or V7: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - /* the following fields are not maintained by V7: */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_m; /* interleave factor */ - __fs16 s_n; /* interleave factor */ - char s_fname[6]; /* file system name */ - char s_fpack[6]; /* file system pack name */ -}; -/* Constants to aid sanity checking */ -/* This is not a hard limit, nor enforced by v7 kernel. It's actually just - * the limit used by Seventh Edition's ls, though is high enough to assume - * that no reasonable file system would have that much entries in root - * directory. Thus, if we see anything higher, we just probably got the - * endiannes wrong. 
*/ -#define V7_NFILES 1024 -/* The disk addresses are three-byte (despite direct block addresses being - * aligned word-wise in inode). If the most significant byte is non-zero, - * something is most likely wrong (not a filesystem, bad bytesex). */ -#define V7_MAXSIZE 0x00ffffff - -/* Coherent super-block data on disk */ -#define COH_NICINOD 100 /* number of inode cache entries */ -#define COH_NICFREE 64 /* number of free block list chunk entries */ -struct coh_super_block { - __fs16 s_isize; /* index of first data zone */ - __fs32 s_fsize __packed2__; /* total number of zones of this fs */ - /* the start of the free block list: */ - __fs16 s_nfree; /* number of free blocks in s_free, <= COH_NICFREE */ - sysv_zone_t s_free[COH_NICFREE] __packed2__; /* first free block list chunk */ - /* the cache of free inodes: */ - __fs16 s_ninode; /* number of free inodes in s_inode, <= COH_NICINOD */ - sysv_ino_t s_inode[COH_NICINOD]; /* some free inodes */ - /* locks, not used by Linux: */ - char s_flock; /* lock during free block list manipulation */ - char s_ilock; /* lock during inode cache manipulation */ - char s_fmod; /* super-block modified flag */ - char s_ronly; /* flag whether fs is mounted read-only */ - __fs32 s_time __packed2__; /* time of last super block update */ - __fs32 s_tfree __packed2__; /* total number of free zones */ - __fs16 s_tinode; /* total number of free inodes */ - __fs16 s_interleave_m; /* interleave factor */ - __fs16 s_interleave_n; - char s_fname[6]; /* file system volume name */ - char s_fpack[6]; /* file system pack name */ - __fs32 s_unique; /* zero, not used */ -}; - -/* SystemV/Coherent inode data on disk */ -struct sysv_inode { - __fs16 i_mode; - __fs16 i_nlink; - __fs16 i_uid; - __fs16 i_gid; - __fs32 i_size; - u8 i_data[3*(10+1+1+1)]; - u8 i_gen; - __fs32 i_atime; /* time of last access */ - __fs32 i_mtime; /* time of last modification */ - __fs32 i_ctime; /* time of creation */ -}; - -/* SystemV/Coherent directory entry on disk */ -#define SYSV_NAMELEN 14 /* max size of name in struct sysv_dir_entry */ -struct sysv_dir_entry { - sysv_ino_t inode; - char name[SYSV_NAMELEN]; /* up to 14 characters, the rest are zeroes */ -}; - -#define SYSV_DIRSIZE sizeof(struct sysv_dir_entry) /* size of every directory entry */ - -#endif /* _LINUX_SYSV_FS_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cf2446c9c30d..dd925d84fa46 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -217,54 +217,6 @@ static inline int arch_within_stack_frames(const void * const stack, } #endif -#ifdef CONFIG_HARDENED_USERCOPY -extern void __check_object_size(const void *ptr, unsigned long n, - bool to_user); - -static __always_inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ - if (!__builtin_constant_p(n)) - __check_object_size(ptr, n, to_user); -} -#else -static inline void check_object_size(const void *ptr, unsigned long n, - bool to_user) -{ } -#endif /* CONFIG_HARDENED_USERCOPY */ - -extern void __compiletime_error("copy source size is too small") -__bad_copy_from(void); -extern void __compiletime_error("copy destination size is too small") -__bad_copy_to(void); - -void __copy_overflow(int size, unsigned long count); - -static inline void copy_overflow(int size, unsigned long count) -{ - if (IS_ENABLED(CONFIG_BUG)) - __copy_overflow(size, count); -} - -static __always_inline __must_check bool -check_copy_size(const void *addr, size_t bytes, bool is_source) -{ - int sz = __builtin_object_size(addr, 
0); - if (unlikely(sz >= 0 && sz < bytes)) { - if (!__builtin_constant_p(bytes)) - copy_overflow(sz, bytes); - else if (is_source) - __bad_copy_from(); - else - __bad_copy_to(); - return false; - } - if (WARN_ON_ONCE(bytes > INT_MAX)) - return false; - check_object_size(addr, bytes, is_source); - return true; -} - #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 876e31b4461d..0b8b32bf0655 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -165,6 +165,4 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #endif -struct vdso_data *arch_get_vdso_data(void *vvar_page); - #endif /* _LINUX_TIMENS_H */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 52f5850730b3..24e715f0f6d2 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,6 +240,29 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#ifndef topology_is_primary_thread + +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + /* + * When disabling SMT, the primary thread of the SMT will remain + * enabled/active. Architectures that have a special primary thread + * (e.g. x86) need to override this function. Otherwise the first + * thread in the SMT can be made the primary thread. + * + * The sibling cpumask of an offline CPU always contains the CPU + * itself on architectures using the implementation of + * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. + * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for + * building their topology have to check whether to use this default + * implementation or to override it. + */ + return cpu == cpumask_first(topology_sibling_cpumask(cpu)); +} +#define topology_is_primary_thread topology_is_primary_thread + +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); @@ -262,6 +285,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops) #endif /* CONFIG_NUMA */ /** + * for_each_node_numadist() - iterate over nodes in increasing distance + * order, starting from a given node + * @node: the iteration variable and the starting node. + * @unvisited: a nodemask to keep track of the unvisited nodes. + * + * This macro iterates over NUMA node IDs in increasing distance from the + * starting @node and yields MAX_NUMNODES when all the nodes have been + * visited. + * + * Note that by the time the loop completes, the @unvisited nodemask will + * be fully cleared, unless the loop exits early. + * + * The difference between for_each_node() and for_each_node_numadist() is + * that the former allows to iterate over nodes in numerical order, whereas + * the latter iterates over nodes in increasing order of distance. + * + * This complexity of this iterator is O(N^2), where N represents the + * number of nodes, as each iteration involves scanning all nodes to + * find the one with the shortest distance. + * + * Requires rcu_lock to be held. + */ +#define for_each_node_numadist(node, unvisited) \ + for (int __start = (node), \ + (node) = nearest_node_nodemask((__start), &(unvisited)); \ + (node) < MAX_NUMNODES; \ + node_clear((node), (unvisited)), \ + (node) = nearest_node_nodemask((__start), &(unvisited))) + +/** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. 
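The topology.h hunk above adds for_each_node_numadist(). A usage sketch under assumptions: the starting node is passed in via the iteration variable, node_states[N_MEMORY] is used only as an illustrative source for the writable unvisited mask, and walk_nodes_by_distance() is a hypothetical caller, not part of the patch:

static void walk_nodes_by_distance(int start_node)
{
	nodemask_t unvisited = node_states[N_MEMORY];	/* local, writable copy */
	int node = start_node;

	rcu_read_lock();	/* the iterator must be called under RCU */
	for_each_node_numadist(node, unvisited) {
		/*
		 * Nodes are yielded in increasing distance from start_node;
		 * each visited node is cleared from the unvisited mask.
		 */
	}
	rcu_read_unlock();
}

As the kernel-doc notes, each iteration rescans the remaining nodes for the nearest one, so the full walk is O(N^2) in the number of nodes.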
diff --git a/include/linux/torture.h b/include/linux/torture.h index 0134e7221cae..1b59056c3b18 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap); /* Initialization and cleanup. */ bool torture_init_begin(char *ttype, int v); void torture_init_end(void); +unsigned long get_torture_init_jiffies(void); bool torture_cleanup_begin(void); void torture_cleanup_end(void); bool torture_must_stop(void); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e9c702c1908d..7c06f4795670 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -7,7 +7,7 @@ #include <linux/minmax.h> #include <linux/nospec.h> #include <linux/sched.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <asm/uaccess.h> diff --git a/include/linux/ucopysize.h b/include/linux/ucopysize.h new file mode 100644 index 000000000000..41c2d9720466 --- /dev/null +++ b/include/linux/ucopysize.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Perform sanity checking for object sizes for uaccess.h and uio.h. */ +#ifndef __LINUX_UCOPYSIZE_H__ +#define __LINUX_UCOPYSIZE_H__ + +#include <linux/bug.h> + +#ifdef CONFIG_HARDENED_USERCOPY +#include <linux/jump_label.h> +extern void __check_object_size(const void *ptr, unsigned long n, + bool to_user); + +DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); + +static __always_inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ + if (!__builtin_constant_p(n) && + static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + &validate_usercopy_range)) { + __check_object_size(ptr, n, to_user); + } +} +#else +static inline void check_object_size(const void *ptr, unsigned long n, + bool to_user) +{ } +#endif /* CONFIG_HARDENED_USERCOPY */ + +extern void __compiletime_error("copy source size is too small") +__bad_copy_from(void); +extern void __compiletime_error("copy destination size is too small") +__bad_copy_to(void); + +void __copy_overflow(int size, unsigned long count); + +static inline void copy_overflow(int size, unsigned long count) +{ + if (IS_ENABLED(CONFIG_BUG)) + __copy_overflow(size, count); +} + +static __always_inline __must_check bool +check_copy_size(const void *addr, size_t bytes, bool is_source) +{ + int sz = __builtin_object_size(addr, 0); + if (unlikely(sz >= 0 && sz < bytes)) { + if (!__builtin_constant_p(bytes)) + copy_overflow(sz, bytes); + else if (is_source) + __bad_copy_from(); + else + __bad_copy_to(); + return false; + } + if (WARN_ON_ONCE(bytes > INT_MAX)) + return false; + check_object_size(addr, bytes, is_source); + return true; +} + +#endif /* __LINUX_UCOPYSIZE_H__ */ diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index f85ec5613721..2dc767e08f54 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -132,6 +132,7 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) u32 map_id_down(struct uid_gid_map *map, u32 id); u32 map_id_up(struct uid_gid_map *map, u32 id); +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count); #else @@ -186,6 +187,11 @@ static inline u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } +static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) +{ + return id; +} + static inline u32 map_id_up(struct uid_gid_map *map, u32 id) { return id; diff --git a/include/linux/uio.h b/include/linux/uio.h index 8ada84e85447..49ece9e1888f 100644 --- a/include/linux/uio.h 
+++ b/include/linux/uio.h @@ -6,8 +6,8 @@ #define __LINUX_UIO_H #include <linux/kernel.h> -#include <linux/thread_info.h> #include <linux/mm_types.h> +#include <linux/ucopysize.h> #include <uapi/linux/uio.h> struct page; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b1df7d792fa1..2e46b69ff0a6 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -39,6 +39,8 @@ struct page; #define MAX_URETPROBE_DEPTH 64 +#define UPROBE_NO_TRAMPOLINE_VADDR (~0UL) + struct uprobe_consumer { /* * handler() can return UPROBE_HANDLER_REMOVE to signal the need to @@ -143,6 +145,7 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + bool signal_denied; struct arch_uprobe *auprobe; }; diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h new file mode 100644 index 000000000000..a91fa24b06e0 --- /dev/null +++ b/include/linux/vdso_datastore.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VDSO_DATASTORE_H +#define _LINUX_VDSO_DATASTORE_H + +#include <linux/mm_types.h> + +extern const struct vm_special_mapping vdso_vvar_mapping; +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); + +#endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h new file mode 100644 index 000000000000..9cf22d3eb9dd --- /dev/null +++ b/include/linux/vfsdebug.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VFS_DEBUG_H +#define LINUX_VFS_DEBUG_H 1 + +#include <linux/bug.h> + +struct inode; + +#ifdef CONFIG_DEBUG_VFS +void dump_inode(struct inode *inode, const char *reason); + +#define VFS_BUG_ON(cond) BUG_ON(cond) +#define VFS_WARN_ON(cond) (void)WARN_ON(cond) +#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VFS_WARN(cond, format...) (void)WARN(cond, format) + +#define VFS_BUG_ON_INODE(cond, inode) ({ \ + if (unlikely(!!(cond))) { \ + dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\ + BUG_ON(1); \ + } \ +}) + +#define VFS_WARN_ON_INODE(cond, inode) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) +#else +#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) + +#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond) +#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond) +#endif /* CONFIG_DEBUG_VFS */ + +#endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f70d0958095c..5a37cb2b6f93 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -151,6 +151,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, DIRECT_MAP_LEVEL3_SPLIT, + DIRECT_MAP_LEVEL2_COLLAPSE, + DIRECT_MAP_LEVEL3_COLLAPSE, #endif #ifdef CONFIG_PER_VMA_LOCK_STATS VMA_LOCK_SUCCESS, diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index e1dec1a6a749..37e003ae5262 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -6,9 +6,8 @@ #include <linux/elfcore.h> #include <linux/elf.h> -#define CRASH_CORE_NOTE_NAME "CORE" #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) /* diff --git a/include/linux/wait.h b/include/linux/wait.h index 6d90ad974408..3503fe822e38 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); } \ \ cmd; \ + \ + if (condition) \ + break; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0754c463224a..cf793d18e5df 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -69,6 +69,8 @@ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t, diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0d51970d809f..3ec915738112 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -683,7 +683,7 @@ enum { #define HCI_ERROR_REMOTE_POWER_OFF 0x15 #define HCI_ERROR_LOCAL_HOST_TERM 0x16 #define HCI_ERROR_PAIRING_NOT_ALLOWED 0x18 -#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1e +#define HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE 0x1a #define HCI_ERROR_INVALID_LL_PARAMS 0x1e #define HCI_ERROR_UNSPECIFIED 0x1f #define HCI_ERROR_ADVERTISING_TIMEOUT 0x3c diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 90f56656b572..62e9d7673862 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -408,8 +408,6 @@ struct gdma_context { struct gdma_dev mana_ib; }; -#define MAX_NUM_GDMA_DEVICES 4 - static inline bool mana_gd_is_mana(struct gdma_dev *gd) { return gd->dev_id.type == GDMA_DEVICE_MANA; @@ -556,11 +554,15 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) +/* Driver can handle holes (zeros) in the device list */ +#define 
GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ - GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT) + GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) #define GDMA_DRV_CAP_FLAGS2 0 @@ -621,11 +623,12 @@ struct gdma_query_max_resources_resp { }; /* HW DATA */ /* GDMA_LIST_DEVICES */ +#define GDMA_DEV_LIST_SIZE 64 struct gdma_list_devices_resp { struct gdma_resp_hdr hdr; u32 num_of_devs; u32 reserved; - struct gdma_dev_id devs[64]; + struct gdma_dev_id devs[GDMA_DEV_LIST_SIZE]; }; /* HW DATA */ /* GDMA_REGISTER_DEVICE */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 958a2460330c..8857f5ea77d4 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -127,27 +127,29 @@ enum yfs_cm_operation { E_(afs_call_trace_work, "QUEUE") #define afs_server_traces \ - EM(afs_server_trace_alloc, "ALLOC ") \ EM(afs_server_trace_callback, "CALLBACK ") \ EM(afs_server_trace_destroy, "DESTROY ") \ EM(afs_server_trace_free, "FREE ") \ EM(afs_server_trace_gc, "GC ") \ - EM(afs_server_trace_get_by_addr, "GET addr ") \ - EM(afs_server_trace_get_by_uuid, "GET uuid ") \ - EM(afs_server_trace_get_caps, "GET caps ") \ - EM(afs_server_trace_get_install, "GET inst ") \ - EM(afs_server_trace_get_new_cbi, "GET cbi ") \ EM(afs_server_trace_get_probe, "GET probe") \ - EM(afs_server_trace_give_up_cb, "giveup-cb") \ EM(afs_server_trace_purging, "PURGE ") \ - EM(afs_server_trace_put_call, "PUT call ") \ EM(afs_server_trace_put_cbi, "PUT cbi ") \ - EM(afs_server_trace_put_find_rsq, "PUT f-rsq") \ EM(afs_server_trace_put_probe, "PUT probe") \ - EM(afs_server_trace_put_slist, "PUT slist") \ - EM(afs_server_trace_put_slist_isort, "PUT isort") \ - EM(afs_server_trace_put_uuid_rsq, "PUT u-req") \ - E_(afs_server_trace_update, "UPDATE") + EM(afs_server_trace_see_destroyer, "SEE destr") \ + EM(afs_server_trace_see_expired, "SEE expd ") \ + EM(afs_server_trace_see_purge, "SEE purge") \ + EM(afs_server_trace_see_timer, "SEE timer") \ + EM(afs_server_trace_unuse_call, "UNU call ") \ + EM(afs_server_trace_unuse_create_fail, "UNU cfail") \ + EM(afs_server_trace_unuse_slist, "UNU slist") \ + EM(afs_server_trace_unuse_slist_isort, "UNU isort") \ + EM(afs_server_trace_update, "UPDATE ") \ + EM(afs_server_trace_use_by_uuid, "USE uuid ") \ + EM(afs_server_trace_use_cm_call, "USE cm-cl") \ + EM(afs_server_trace_use_get_caps, "USE gcaps") \ + EM(afs_server_trace_use_give_up_cb, "USE gvupc") \ + EM(afs_server_trace_use_install, "USE inst ") \ + E_(afs_server_trace_wait_create, "WAIT crt ") #define afs_volume_traces \ EM(afs_volume_trace_alloc, "ALLOC ") \ @@ -169,41 +171,48 @@ enum yfs_cm_operation { #define afs_cell_traces \ EM(afs_cell_trace_alloc, "ALLOC ") \ + EM(afs_cell_trace_destroy, "DESTROY ") \ EM(afs_cell_trace_free, "FREE ") \ EM(afs_cell_trace_get_atcell, "GET atcell") \ - EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ - EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ - EM(afs_cell_trace_get_queue_new, "GET q-new ") \ EM(afs_cell_trace_get_server, "GET server") \ EM(afs_cell_trace_get_vol, "GET vol ") \ - EM(afs_cell_trace_insert, "INSERT ") \ - EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_purge, "PURGE ") \ EM(afs_cell_trace_put_atcell, "PUT atcell") \ EM(afs_cell_trace_put_candidate, "PUT candid") \ - EM(afs_cell_trace_put_destroy, "PUT destry") \ - 
EM(afs_cell_trace_put_queue_work, "PUT q-work") \ - EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ + EM(afs_cell_trace_put_final, "PUT final ") \ EM(afs_cell_trace_put_server, "PUT server") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ + EM(afs_cell_trace_queue_again, "QUE again ") \ + EM(afs_cell_trace_queue_dns, "QUE dns ") \ + EM(afs_cell_trace_queue_new, "QUE new ") \ + EM(afs_cell_trace_queue_purge, "QUE purge ") \ + EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_managed, "MANAGED ") \ EM(afs_cell_trace_see_source, "SEE source") \ - EM(afs_cell_trace_see_ws, "SEE ws ") \ + EM(afs_cell_trace_see_mgmt_timer, "SEE mtimer") \ EM(afs_cell_trace_unuse_alias, "UNU alias ") \ EM(afs_cell_trace_unuse_check_alias, "UNU chk-al") \ EM(afs_cell_trace_unuse_delete, "UNU delete") \ + EM(afs_cell_trace_unuse_dynroot_mntpt, "UNU dyn-mp") \ EM(afs_cell_trace_unuse_fc, "UNU fc ") \ - EM(afs_cell_trace_unuse_lookup, "UNU lookup") \ + EM(afs_cell_trace_unuse_lookup_dynroot, "UNU lu-dyn") \ + EM(afs_cell_trace_unuse_lookup_error, "UNU lu-err") \ EM(afs_cell_trace_unuse_mntpt, "UNU mntpt ") \ EM(afs_cell_trace_unuse_no_pin, "UNU no-pin") \ EM(afs_cell_trace_unuse_parse, "UNU parse ") \ EM(afs_cell_trace_unuse_pin, "UNU pin ") \ - EM(afs_cell_trace_unuse_probe, "UNU probe ") \ EM(afs_cell_trace_unuse_sbi, "UNU sbi ") \ EM(afs_cell_trace_unuse_ws, "UNU ws ") \ EM(afs_cell_trace_use_alias, "USE alias ") \ EM(afs_cell_trace_use_check_alias, "USE chk-al") \ EM(afs_cell_trace_use_fc, "USE fc ") \ EM(afs_cell_trace_use_fc_alias, "USE fc-al ") \ - EM(afs_cell_trace_use_lookup, "USE lookup") \ + EM(afs_cell_trace_use_lookup_add, "USE lu-add") \ + EM(afs_cell_trace_use_lookup_canonical, "USE lu-can") \ + EM(afs_cell_trace_use_lookup_dynroot, "USE lu-dyn") \ + EM(afs_cell_trace_use_lookup_mntpt, "USE lu-mpt") \ + EM(afs_cell_trace_use_lookup_mount, "USE lu-mnt") \ + EM(afs_cell_trace_use_lookup_ws, "USE lu-ws ") \ EM(afs_cell_trace_use_mntpt, "USE mntpt ") \ EM(afs_cell_trace_use_pin, "USE pin ") \ EM(afs_cell_trace_use_probe, "USE probe ") \ @@ -220,7 +229,7 @@ enum yfs_cm_operation { EM(afs_alist_trace_put_getaddru, "PUT GtAdrU") \ EM(afs_alist_trace_put_parse_empty, "PUT p-empt") \ EM(afs_alist_trace_put_parse_error, "PUT p-err ") \ - EM(afs_alist_trace_put_server_dup, "PUT sv-dup") \ + EM(afs_alist_trace_put_server_create, "PUT sv-crt") \ EM(afs_alist_trace_put_server_oom, "PUT sv-oom") \ EM(afs_alist_trace_put_server_update, "PUT sv-upd") \ EM(afs_alist_trace_put_vlgetcaps, "PUT vgtcap") \ @@ -1270,7 +1279,7 @@ TRACE_EVENT(afs_bulkstat_error, ); TRACE_EVENT(afs_cm_no_server, - TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx), + TP_PROTO(struct afs_call *call, const struct sockaddr_rxrpc *srx), TP_ARGS(call, srx), @@ -1529,7 +1538,7 @@ TRACE_EVENT(afs_server, __entry->reason = reason; ), - TP_printk("s=%08x %s u=%d a=%d", + TP_printk("s=%08x %s r=%d a=%d", __entry->server, __print_symbolic(__entry->reason, afs_server_traces), __entry->ref, @@ -1537,25 +1546,29 @@ TRACE_EVENT(afs_server, ); TRACE_EVENT(afs_volume, - TP_PROTO(afs_volid_t vid, int ref, enum afs_volume_trace reason), + TP_PROTO(unsigned int debug_id, afs_volid_t vid, int ref, + enum afs_volume_trace reason), - TP_ARGS(vid, ref, reason), + TP_ARGS(debug_id, vid, ref, reason), TP_STRUCT__entry( + __field(unsigned int, debug_id) __field(afs_volid_t, vid) __field(int, ref) __field(enum afs_volume_trace, reason) ), TP_fast_assign( - __entry->vid = vid; - __entry->ref = ref; - __entry->reason = reason; + __entry->debug_id = debug_id; 
+ __entry->vid = vid; + __entry->ref = ref; + __entry->reason = reason; ), - TP_printk("V=%llx %s ur=%d", - __entry->vid, + TP_printk("V=%08x %s vid=%llx r=%d", + __entry->debug_id, __print_symbolic(__entry->reason, afs_volume_traces), + __entry->vid, __entry->ref) ); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50..5fbdabe3faea 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -207,7 +207,7 @@ TRACE_EVENT_RCU(rcu_exp_grace_period, __entry->gpevent = gpevent; ), - TP_printk("%s %ld %s", + TP_printk("%s %#lx %s", __entry->rcuname, __entry->gpseq, __entry->gpevent) ); @@ -561,40 +561,6 @@ TRACE_EVENT_RCU(rcu_segcb_stats, ); /* - * Tracepoint for the registration of a single RCU callback of the special - * kvfree() form. The first argument is the RCU type, the second argument - * is a pointer to the RCU callback, the third argument is the offset - * of the callback within the enclosing RCU-protected data structure, - * the fourth argument is the number of lazy callbacks queued, and the - * fifth argument is the total number of callbacks queued. - */ -TRACE_EVENT_RCU(rcu_kvfree_callback, - - TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), - - TP_ARGS(rcuname, rhp, offset, qlen), - - TP_STRUCT__entry( - __field(const char *, rcuname) - __field(void *, rhp) - __field(unsigned long, offset) - __field(long, qlen) - ), - - TP_fast_assign( - __entry->rcuname = rcuname; - __entry->rhp = rhp; - __entry->offset = offset; - __entry->qlen = qlen; - ), - - TP_printk("%s rhp=%p func=%ld %ld", - __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) -); - -/* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. 
The first argument is the RCU flavor, * the second is the number of lazy callbacks queued, the third is diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9ea4c404bd4e..bfd97cce40a1 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -193,9 +193,7 @@ static inline long __trace_sched_switch_state(bool preempt, { unsigned int state; -#ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); -#endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h index fe19da7315a9..50e4b712735a 100644 --- a/include/trace/events/sched_ext.h +++ b/include/trace/events/sched_ext.h @@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump, ) ); +TRACE_EVENT(sched_ext_event, + TP_PROTO(const char *name, __s64 delta), + TP_ARGS(name, delta), + + TP_STRUCT__entry( + __string(name, name) + __field( __s64, delta ) + ), + + TP_fast_assign( + __assign_str(name); + __entry->delta = delta; + ), + + TP_printk("name %s delta %lld", + __get_str(name), __entry->delta + ) +); + #endif /* _TRACE_SCHED_EXT_H */ /* This part must be outside protection */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 1ea2c4c33b86..ef1c27fa3c57 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -85,6 +85,7 @@ /* compatibility flags */ #define MAP_FILE 0 +#define PKEY_UNRESTRICTED 0x0 #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 88dc393c2bca..2892a45023af 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -849,9 +849,11 @@ __SYSCALL(__NR_getxattrat, sys_getxattrat) __SYSCALL(__NR_listxattrat, sys_listxattrat) #define __NR_removexattrat 466 __SYSCALL(__NR_removexattrat, sys_removexattrat) +#define __NR_open_tree_attr 467 +__SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) #undef __NR_syscalls -#define __NR_syscalls 467 +#define __NR_syscalls 468 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index 5ee30f882736..682b406e1067 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,13 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) \ - (((~_UL(0)) - (_UL(1) << (l)) + 1) & \ - (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) \ - (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ - (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index e16be0d37746..b8f629ef135f 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -33,7 +33,7 @@ * Missing asm support * * __BIT128() would not work in the asm code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. 
* diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b44069d29cec..819ded2d39de 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -11,6 +11,7 @@ typedef __u16 Elf32_Half; typedef __u32 Elf32_Off; typedef __s32 Elf32_Sword; typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; /* 64-bit ELF base types. */ typedef __u64 Elf64_Addr; @@ -21,6 +22,7 @@ typedef __s32 Elf64_Sword; typedef __u32 Elf64_Word; typedef __u64 Elf64_Xword; typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; /* These constants are for the segment types stored in the image headers */ #define PT_NULL 0 @@ -107,6 +109,7 @@ typedef __s64 Elf64_Sxword; #define DT_VALRNGLO 0x6ffffd00 #define DT_VALRNGHI 0x6ffffdff #define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 #define DT_ADDRRNGHI 0x6ffffeff #define DT_VERSYM 0x6ffffff0 #define DT_RELACOUNT 0x6ffffff9 @@ -125,6 +128,8 @@ typedef __s64 Elf64_Sxword; #define STB_GLOBAL 1 #define STB_WEAK 2 +#define STN_UNDEF 0 + #define STT_NOTYPE 0 #define STT_OBJECT 1 #define STT_FUNC 2 @@ -133,6 +138,9 @@ typedef __s64 Elf64_Sxword; #define STT_COMMON 5 #define STT_TLS 6 +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + #define ELF_ST_BIND(x) ((x) >> 4) #define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) @@ -291,8 +299,18 @@ typedef struct elf64_phdr { #define SHF_WRITE 0x1 #define SHF_ALLOC 0x2 #define SHF_EXECINSTR 0x4 +#define SHF_MERGE 0x10 +#define SHF_STRINGS 0x20 +#define SHF_INFO_LINK 0x40 +#define SHF_LINK_ORDER 0x80 +#define SHF_OS_NONCONFORMING 0x100 +#define SHF_GROUP 0x200 +#define SHF_TLS 0x400 #define SHF_RELA_LIVEPATCH 0x00100000 #define SHF_RO_AFTER_INIT 0x00200000 +#define SHF_ORDERED 0x04000000 +#define SHF_EXCLUDE 0x08000000 +#define SHF_MASKOS 0x0ff00000 #define SHF_MASKPROC 0xf0000000 /* special section indexes */ @@ -368,101 +386,180 @@ typedef struct elf64_shdr { #define ELF_OSABI ELFOSABI_NONE #endif +/* Note definitions: NN_ defines names. NT_ defines types. */ + +#define NN_GNU_PROPERTY_TYPE_0 "GNU" +#define NT_GNU_PROPERTY_TYPE_0 5 + /* * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. - * The note name for these types is "LINUX", except NT_PRFPREG that is named - * "CORE". */ +#define NN_PRSTATUS "CORE" #define NT_PRSTATUS 1 +#define NN_PRFPREG "CORE" #define NT_PRFPREG 2 +#define NN_PRPSINFO "CORE" #define NT_PRPSINFO 3 +#define NN_TASKSTRUCT "CORE" #define NT_TASKSTRUCT 4 +#define NN_AUXV "CORE" #define NT_AUXV 6 /* * Note to userspace developers: size of NT_SIGINFO note may increase * in the future to accomodate more fields, don't assume it is fixed! 
*/ +#define NN_SIGINFO "CORE" #define NT_SIGINFO 0x53494749 +#define NN_FILE "CORE" #define NT_FILE 0x46494c45 +#define NN_PRXFPREG "LINUX" #define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NN_PPC_VMX "LINUX" #define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NN_PPC_SPE "LINUX" #define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NN_PPC_VSX "LINUX" #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NN_PPC_TAR "LINUX" #define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NN_PPC_PPR "LINUX" #define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NN_PPC_DSCR "LINUX" #define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NN_PPC_EBB "LINUX" #define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NN_PPC_PMU "LINUX" #define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NN_PPC_TM_CGPR "LINUX" #define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NN_PPC_TM_CFPR "LINUX" #define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NN_PPC_TM_CVMX "LINUX" #define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NN_PPC_TM_CVSX "LINUX" #define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NN_PPC_TM_SPR "LINUX" #define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NN_PPC_TM_CTAR "LINUX" #define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NN_PPC_TM_CPPR "LINUX" #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NN_PPC_TM_CDSCR "LINUX" #define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NN_PPC_PKEY "LINUX" #define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NN_PPC_DEXCR "LINUX" #define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NN_PPC_HASHKEYR "LINUX" #define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NN_386_TLS "LINUX" #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NN_386_IOPERM "LINUX" #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NN_X86_XSTATE "LINUX" #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Old binutils treats 0x203 as a CET state */ +#define NN_X86_SHSTK "LINUX" #define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NN_X86_XSAVE_LAYOUT "LINUX" #define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NN_S390_HIGH_GPRS "LINUX" #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NN_S390_TIMER "LINUX" #define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NN_S390_TODCMP "LINUX" #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NN_S390_TODPREG "LINUX" #define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ +#define NN_S390_CTRS "LINUX" #define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NN_S390_PREFIX "LINUX" #define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NN_S390_LAST_BREAK "LINUX" #define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NN_S390_SYSTEM_CALL "LINUX" #define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NN_S390_TDB "LINUX" #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NN_S390_VXRS_LOW "LINUX" #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NN_S390_VXRS_HIGH "LINUX" #define NT_S390_VXRS_HIGH 0x30a 
/* s390 vector registers 16-31 */ +#define NN_S390_GS_CB "LINUX" #define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NN_S390_GS_BC "LINUX" #define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NN_S390_RI_CB "LINUX" #define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NN_S390_PV_CPU_DATA "LINUX" #define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NN_ARM_VFP "LINUX" #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NN_ARM_TLS "LINUX" #define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NN_ARM_HW_BREAK "LINUX" #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NN_ARM_HW_WATCH "LINUX" #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NN_ARM_SYSTEM_CALL "LINUX" #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NN_ARM_SVE "LINUX" #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NN_ARM_PAC_MASK "LINUX" #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NN_ARM_PACA_KEYS "LINUX" #define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NN_ARM_PACG_KEYS "LINUX" #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NN_ARM_TAGGED_ADDR_CTRL "LINUX" #define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NN_ARM_PAC_ENABLED_KEYS "LINUX" #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NN_ARM_SSVE "LINUX" #define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NN_ARM_ZA "LINUX" #define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NN_ARM_ZT "LINUX" #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NN_ARM_FPMR "LINUX" #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NN_ARM_POE "LINUX" #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NN_ARM_GCS "LINUX" #define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NN_ARC_V2 "LINUX" #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NN_VMCOREDD "LINUX" #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NN_MIPS_DSP "LINUX" #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NN_MIPS_FP_MODE "LINUX" #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NN_MIPS_MSA "LINUX" #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NN_RISCV_CSR "LINUX" #define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NN_RISCV_VECTOR "LINUX" #define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NN_RISCV_TAGGED_ADDR_CTRL "LINUX" #define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NN_LOONGARCH_CPUCFG "LINUX" #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NN_LOONGARCH_CSR "LINUX" #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NN_LOONGARCH_LSX "LINUX" #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NN_LOONGARCH_LASX "LINUX" #define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NN_LOONGARCH_LBT "LINUX" #define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NN_LOONGARCH_HW_BREAK "LINUX" #define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NN_LOONGARCH_HW_WATCH 
"LINUX" #define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ -/* Note types with note name "GNU" */ -#define NT_GNU_PROPERTY_TYPE_0 5 - /* Note header in a PT_NOTE section */ typedef struct elf32_note { Elf32_Word n_namesz; /* Name size */ @@ -483,4 +580,34 @@ typedef struct elf64_note { /* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + #endif /* _UAPI_LINUX_ELF_H */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index bd8167979707..e710967c7c26 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -28,6 +28,8 @@ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ +#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */ +#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ @@ -64,6 +66,7 @@ #define FAN_REPORT_NAME 0x00000800 /* Report events with name */ #define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */ #define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */ +#define FAN_REPORT_MNT 0x00004000 /* Report mount events */ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME) @@ -94,6 +97,7 @@ #define FAN_MARK_INODE 0x00000000 #define FAN_MARK_MOUNT 0x00000010 #define FAN_MARK_FILESYSTEM 0x00000100 +#define FAN_MARK_MNTNS 0x00000110 /* * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY @@ -147,6 +151,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 #define FAN_EVENT_INFO_TYPE_RANGE 6 +#define FAN_EVENT_INFO_TYPE_MNT 7 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -200,6 +205,11 @@ struct fanotify_event_info_range { __u64 count; }; +struct fanotify_event_info_mnt { + struct fanotify_event_info_header hdr; + __u64 mnt_id; +}; + /* * User space may need to record additional information about its decision. * The extra information type records what kind of information is included. diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 1f2c9469f921..05e3aa8fa8bc 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -449,7 +449,8 @@ struct fw_cdev_event_phy_packet { * which the packet arrived. For %FW_CDEV_EVENT_PHY_PACKET_SENT2 and non-ping packet, * the time stamp of isochronous cycle at which the packet was sent. For ping packet, * the tick count for round-trip time measured by 1394 OHCI controller. 
- * The time stamp of isochronous cycle at which either the response was sent for + * + * The time stamp of isochronous cycle at which either the response was sent for * %FW_CDEV_EVENT_PHY_PACKET_SENT2 or the request arrived for * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2. * @data: Incoming data diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index c07008816aca..7fa67c2031a5 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -179,7 +179,12 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u64 supported_mask; /* Mask flags that this kernel supports */ + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings (as seen from callers namespace) */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings (as seen from callers namespace) */ + __u64 __spare2[43]; char str[]; /* Variable size part containing strings */ }; @@ -217,6 +222,9 @@ struct mnt_id_req { #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ #define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ #define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ +#define STATMOUNT_SUPPORTED_MASK 0x00001000U /* Want/got supported mask flags */ +#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */ +#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */ /* * Special @mnt_id values that can be passed to listmount diff --git a/include/uapi/linux/nilfs2_ondisk.h b/include/uapi/linux/nilfs2_ondisk.h index c23f91ae5fe8..3196cc44a002 100644 --- a/include/uapi/linux/nilfs2_ondisk.h +++ b/include/uapi/linux/nilfs2_ondisk.h @@ -188,7 +188,8 @@ struct nilfs_super_block { __le16 s_segment_usage_size; /* Size of a segment usage */ /*98*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*A8*/ char s_volume_name[80]; /* volume name */ +/*A8*/ char s_volume_name[80] /* volume name */ + __kernel_nonstring; /*F8*/ __le32 s_c_interval; /* Commit interval of segment */ __le32 s_c_block_max; /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0524d541d4e3..5fc753c23734 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 4540f6301b8c..2970ef44655a 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -10,6 +10,10 @@ /* Flags for pidfd_open(). */ #define PIDFD_NONBLOCK O_NONBLOCK #define PIDFD_THREAD O_EXCL +#ifdef __KERNEL__ +#include <linux/sched.h> +#define PIDFD_CLONE CLONE_PIDFD +#endif /* Flags for pidfd_send_signal(). */ #define PIDFD_SIGNAL_THREAD (1UL << 0) @@ -20,9 +24,34 @@ #define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */ #define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */ #define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ +#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. 
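A rough userspace sketch of the fanotify mount-event interface added a little further above (FAN_REPORT_MNT, FAN_MARK_MNTNS, FAN_MNT_ATTACH/FAN_MNT_DETACH and the FAN_EVENT_INFO_TYPE_MNT record). This is an untested illustration that assumes headers carrying the new definitions and that the watched mount namespace is identified by an nsfs fd passed as the dirfd argument of fanotify_mark():

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct fanotify_event_metadata *md;
	ssize_t len;
	int fan_fd, mntns_fd;

	fan_fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_MNT, 0);
	mntns_fd = open("/proc/self/ns/mnt", O_RDONLY);
	if (fan_fd < 0 || mntns_fd < 0)
		return 1;

	/* watch the whole mount namespace for mounts coming and going */
	if (fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
			  FAN_MNT_ATTACH | FAN_MNT_DETACH, mntns_fd, NULL))
		return 1;

	len = read(fan_fd, buf, sizeof(buf));
	for (md = (void *)buf; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		struct fanotify_event_info_mnt *info = (void *)(md + 1);

		/* mount events carry no fd, only an info record with the mount ID */
		if (md->event_len > md->metadata_len &&
		    info->hdr.info_type == FAN_EVENT_INFO_TYPE_MNT)
			printf("mask %#llx mnt_id %llu\n",
			       (unsigned long long)md->mask,
			       (unsigned long long)info->mnt_id);
	}
	return 0;
}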
*/ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gideon knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + +/* + * ...and for userland we make life simpler - PIDFD_SELF refers to the current + * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. + * + * For nearly all practical uses, a user will want to use PIDFD_SELF. + */ +#define PIDFD_SELF PIDFD_SELF_THREAD +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP + struct pidfd_info { /* * This mask is similar to the request_mask in statx(2). @@ -62,7 +91,7 @@ struct pidfd_info { __u32 sgid; __u32 fsuid; __u32 fsgid; - __u32 spare0[1]; + __s32 exit_code; }; #define PIDFS_IOCTL_MAGIC 0xFF diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 5c6080680cb2..15c18ef4eb11 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -353,4 +353,15 @@ struct prctl_mm_map { */ #define PR_LOCK_SHADOW_STACK_STATUS 76 +/* + * Controls the mode of timer_create() for CRIU restore operations. + * Enabling this allows CRIU to restore timers with explicit IDs. + * + * Don't use for normal operations as the result might be undefined. 
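The PIDFD_INFO_EXIT flag and exit_code field above let a supervisor collect a child's exit status straight from its pidfd, even if another process does the reaping. A hedged sketch, assuming a <linux/pidfd.h> new enough to provide struct pidfd_info and the PIDFD_GET_INFO ioctl:

#define _GNU_SOURCE
#include <linux/pidfd.h>
#include <poll.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
	struct pollfd pfd = { .events = POLLIN };
	pid_t pid = fork();

	if (pid == 0)
		_exit(42);

	pfd.fd = (int)syscall(SYS_pidfd_open, pid, 0);
	poll(&pfd, 1, -1);	/* the pidfd becomes readable once the child exits */

	if (ioctl(pfd.fd, PIDFD_GET_INFO, &info) == 0 && (info.mask & PIDFD_INFO_EXIT))
		printf("exit_code %d, WEXITSTATUS %d\n",
		       info.exit_code, WEXITSTATUS(info.exit_code));

	waitpid(pid, NULL, 0);	/* reap the child as usual */
	return 0;
}

Per the comment above, PIDFD_SELF (or PIDFD_SELF_PROCESS) can likewise be passed wherever a pidfd is accepted to mean the calling thread or thread group leader.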
+ */ +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index a6fce46aeb37..b87df1b485c2 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -70,4 +70,10 @@ #define __counted_by_be(m) #endif +#ifdef __KERNEL__ +#define __kernel_nonstring __nonstring +#else +#define __kernel_nonstring +#endif + #endif /* _UAPI_LINUX_STDDEF_H */ diff --git a/include/vdso/align.h b/include/vdso/align.h new file mode 100644 index 000000000000..02dd8626b5c5 --- /dev/null +++ b/include/vdso/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_ALIGN_H +#define __VDSO_ALIGN_H + +#include <vdso/const.h> + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/cache.h b/include/vdso/cache.h new file mode 100644 index 000000000000..f89d48304bf8 --- /dev/null +++ b/include/vdso/cache.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_CACHE_H +#define __VDSO_CACHE_H + +#include <asm/cache.h> + +#ifndef SMP_CACHE_BYTES +#define SMP_CACHE_BYTES L1_CACHE_BYTES +#endif + +#ifndef ____cacheline_aligned +#define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#endif /* __VDSO_ALIGN_H */ diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index d967baa0cd0c..1864e76e8f69 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -9,11 +9,14 @@ #include <uapi/linux/types.h> #include <uapi/asm-generic/errno-base.h> +#include <vdso/align.h> #include <vdso/bits.h> +#include <vdso/cache.h> #include <vdso/clocksource.h> #include <vdso/ktime.h> #include <vdso/limits.h> #include <vdso/math64.h> +#include <vdso/page.h> #include <vdso/processor.h> #include <vdso/time.h> #include <vdso/time32.h> @@ -25,6 +28,15 @@ struct arch_vdso_time_data {}; #endif +#if defined(CONFIG_ARCH_HAS_VDSO_ARCH_DATA) +#include <asm/vdso/arch_data.h> +#elif defined(CONFIG_GENERIC_VDSO_DATA_STORE) +struct vdso_arch_data { + /* Needed for the generic code, never actually used at runtime */ + char __unused; +}; +#endif + #define VDSO_BASES (CLOCK_TAI + 1) #define VDSO_HRES (BIT(CLOCK_REALTIME) | \ BIT(CLOCK_MONOTONIC) | \ @@ -45,11 +57,11 @@ struct arch_vdso_time_data {}; * * There is one vdso_timestamp object in vvar for each vDSO-accelerated * clock_id. For high-resolution clocks, this encodes the time - * corresponding to vdso_data.cycle_last. For coarse clocks this encodes + * corresponding to vdso_time_data.cycle_last. For coarse clocks this encodes * the actual time. * * To be noticed that for highres clocks nsec is left-shifted by - * vdso_data.cs[x].shift. + * vdso_time_data[x].shift. 
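A sketch of how a checkpoint/restore tool might use PR_TIMER_CREATE_RESTORE_IDS, added above. It assumes the CRIU convention that, while the mode is enabled, the desired timer ID is passed in through the created_timer_id pointer of the raw timer_create() syscall (the glibc wrapper does not forward a preset value, hence the raw syscall):

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef PR_TIMER_CREATE_RESTORE_IDS
#define PR_TIMER_CREATE_RESTORE_IDS	77
#define PR_TIMER_CREATE_RESTORE_IDS_OFF	0
#define PR_TIMER_CREATE_RESTORE_IDS_ON	1
#endif

int main(void)
{
	int kernel_timer_id = 42;	/* ID recorded when the process was checkpointed */

	prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0);

	/* NULL sigevent: default notification; the preset ID is requested explicitly */
	if (syscall(SYS_timer_create, CLOCK_MONOTONIC, NULL, &kernel_timer_id))
		perror("timer_create");
	else
		printf("timer restored with id %d\n", kernel_timer_id);

	prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0);
	return 0;
}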
*/ struct vdso_timestamp { u64 sec; @@ -57,7 +69,7 @@ struct vdso_timestamp { }; /** - * struct vdso_data - vdso datapage representation + * struct vdso_clock - vdso per clocksource datapage representation * @seq: timebase sequence counter * @clock_mode: clock mode * @cycle_last: timebase at clocksource init @@ -67,19 +79,9 @@ struct vdso_timestamp { * @shift: clocksource shift * @basetime[clock_id]: basetime per clock_id * @offset[clock_id]: time namespace offset per clock_id - * @tz_minuteswest: minutes west of Greenwich - * @tz_dsttime: type of DST correction - * @hrtimer_res: hrtimer resolution - * @__unused: unused - * @arch_data: architecture specific data (optional, defaults - * to an empty struct) * - * vdso_data will be accessed by 64 bit and compat code at the same time - * so we should be careful before modifying this structure. - * - * The ordering of the struct members is optimized to have fast access to the - * often required struct members which are related to CLOCK_REALTIME and - * CLOCK_MONOTONIC. This information is stored in the first cache lines. + * See also struct vdso_time_data for basic access and ordering information as + * struct vdso_clock is used there. * * @basetime is used to store the base time for the system wide time getter * VVAR page. @@ -92,7 +94,7 @@ struct vdso_timestamp { * For clocks which are not affected by time namespace adjustment the * offset must be zero. */ -struct vdso_data { +struct vdso_clock { u32 seq; s32 clock_mode; @@ -108,14 +110,35 @@ struct vdso_data { struct vdso_timestamp basetime[VDSO_BASES]; struct timens_offset offset[VDSO_BASES]; }; +}; + +/** + * struct vdso_time_data - vdso datapage representation + * @arch_data: architecture specific data (optional, defaults + * to an empty struct) + * @clock_data: clocksource related data (array) + * @tz_minuteswest: minutes west of Greenwich + * @tz_dsttime: type of DST correction + * @hrtimer_res: hrtimer resolution + * @__unused: unused + * + * vdso_time_data will be accessed by 64 bit and compat code at the same time + * so we should be careful before modifying this structure. + * + * The ordering of the struct members is optimized to have fast acces to the + * often required struct members which are related to CLOCK_REALTIME and + * CLOCK_MONOTONIC. This information is stored in the first cache lines. + */ +struct vdso_time_data { + struct arch_vdso_time_data arch_data; - s32 tz_minuteswest; - s32 tz_dsttime; - u32 hrtimer_res; - u32 __unused; + struct vdso_clock clock_data[CS_BASES]; - struct arch_vdso_time_data arch_data; -}; + s32 tz_minuteswest; + s32 tz_dsttime; + u32 hrtimer_res; + u32 __unused; +} ____cacheline_aligned; /** * struct vdso_rng_data - vdso RNG state information @@ -136,22 +159,32 @@ struct vdso_rng_data { * With the hidden visibility, the compiler simply generates a PC-relative * relocation, and this is what we need. 
*/ -extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); -extern struct vdso_rng_data _vdso_rng_data __attribute__((visibility("hidden"))); - -/** - * union vdso_data_store - Generic vDSO data page - */ -union vdso_data_store { - struct vdso_data data[CS_BASES]; - u8 page[1U << CONFIG_PAGE_SHIFT]; +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE +extern struct vdso_time_data vdso_u_time_data __attribute__((visibility("hidden"))); +extern struct vdso_rng_data vdso_u_rng_data __attribute__((visibility("hidden"))); +extern struct vdso_arch_data vdso_u_arch_data __attribute__((visibility("hidden"))); + +extern struct vdso_time_data *vdso_k_time_data; +extern struct vdso_rng_data *vdso_k_rng_data; +extern struct vdso_arch_data *vdso_k_arch_data; + +#define VDSO_ARCH_DATA_SIZE ALIGN(sizeof(struct vdso_arch_data), PAGE_SIZE) +#define VDSO_ARCH_DATA_PAGES (VDSO_ARCH_DATA_SIZE >> PAGE_SHIFT) + +enum vdso_pages { + VDSO_TIME_PAGE_OFFSET, + VDSO_TIMENS_PAGE_OFFSET, + VDSO_RNG_PAGE_OFFSET, + VDSO_ARCH_PAGES_START, + VDSO_ARCH_PAGES_END = VDSO_ARCH_PAGES_START + VDSO_ARCH_DATA_PAGES - 1, + VDSO_NR_PAGES }; +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ + /* * The generic vDSO implementation requires that gettimeofday.h * provides: - * - __arch_get_vdso_data(): to get the vdso datapage. * - __arch_get_hw_counter(): to get the hw counter based on the * clock_mode. * - gettimeofday_fallback(): fallback for gettimeofday. @@ -164,6 +197,27 @@ union vdso_data_store { #include <asm/vdso/gettimeofday.h> #endif /* ENABLE_COMPAT_VDSO */ +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_VDSO_GETRANDOM +#define __vdso_u_rng_data PROVIDE(vdso_u_rng_data = vdso_u_data + 2 * PAGE_SIZE); +#else +#define __vdso_u_rng_data +#endif + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +#define __vdso_u_arch_data PROVIDE(vdso_u_arch_data = vdso_u_data + 3 * PAGE_SIZE); +#else +#define __vdso_u_arch_data +#endif + +#define VDSO_VVAR_SYMS \ + PROVIDE(vdso_u_data = . - __VDSO_PAGES * PAGE_SIZE); \ + PROVIDE(vdso_u_time_data = vdso_u_data); \ + __vdso_u_rng_data \ + __vdso_u_arch_data \ + + #endif /* !__ASSEMBLY__ */ #endif /* __VDSO_DATAPAGE_H */ diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 3ddb03bb05cb..0a98fed550ba 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -7,49 +7,53 @@ #include <asm/barrier.h> #include <vdso/datapage.h> -static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) +static __always_inline u32 vdso_read_begin(const struct vdso_clock *vc) { u32 seq; - while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) cpu_relax(); smp_rmb(); return seq; } -static __always_inline u32 vdso_read_retry(const struct vdso_data *vd, +static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc, u32 start) { u32 seq; smp_rmb(); - seq = READ_ONCE(vd->seq); + seq = READ_ONCE(vc->seq); return seq != start; } -static __always_inline void vdso_write_begin(struct vdso_data *vd) +static __always_inline void vdso_write_begin(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. 
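For context, the read side pairs with vdso_write_begin()/vdso_write_end() in the usual sequence-count pattern. A simplified reader sketch against the renamed structures (kernel-internal code, loosely modelled on the generic __cvdso_* helpers, not a standalone program):

static __always_inline u64 coarse_monotonic_ns(const struct vdso_time_data *vd)
{
	const struct vdso_clock *vc = &vd->clock_data[CS_HRES_COARSE];
	const struct vdso_timestamp *ts = &vc->basetime[CLOCK_MONOTONIC_COARSE];
	u64 sec, nsec;
	u32 seq;

	do {
		seq = vdso_read_begin(vc);	/* spins while the writer holds an odd count */
		sec = ts->sec;
		nsec = ts->nsec;
	} while (vdso_read_retry(vc, seq));	/* a writer raced in between: go again */

	return sec * NSEC_PER_SEC + nsec;
}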
*/ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); smp_wmb(); } -static __always_inline void vdso_write_end(struct vdso_data *vd) +static __always_inline void vdso_write_end(struct vdso_time_data *vd) { + struct vdso_clock *vc = vd->clock_data; + smp_wmb(); /* * WRITE_ONCE() is required otherwise the compiler can validly tear * updates to vd[x].seq and it is possible that the value seen by the * reader is inconsistent. */ - WRITE_ONCE(vd[CS_HRES_COARSE].seq, vd[CS_HRES_COARSE].seq + 1); - WRITE_ONCE(vd[CS_RAW].seq, vd[CS_RAW].seq + 1); + WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1); + WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1); } #endif /* !__ASSEMBLY__ */ diff --git a/init/.kunitconfig b/init/.kunitconfig new file mode 100644 index 000000000000..acb906b1a5f9 --- /dev/null +++ b/init/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_TEST=y diff --git a/init/Kconfig b/init/Kconfig index 324c2886b2ea..681f38ee68db 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -129,6 +129,9 @@ config CC_HAS_COUNTED_BY # https://github.com/llvm/llvm-project/pull/112636 depends on !(CC_IS_CLANG && CLANG_VERSION < 190103) +config CC_HAS_MULTIDIMENSIONAL_NONSTRING + def_bool $(success,echo 'char tag[][4] __attribute__((__nonstring__)) = { };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror) + config RUSTC_HAS_COERCE_POINTEE def_bool RUSTC_VERSION >= 108400 @@ -1195,7 +1198,8 @@ config CPUSETS_V1 help Legacy cgroup v1 cpusets controller which has been deprecated by cgroup v2 implementation. The v1 is there for legacy applications - which haven't migrated to the new cgroup v2 interface yet. If you + which haven't migrated to the new cgroup v2 interface yet. Legacy + interface includes cpuset filesystem and /proc/<pid>/cpuset. If you do not have any such application then you are completely fine leaving this option disabled. @@ -1203,7 +1207,7 @@ config CPUSETS_V1 config PROC_PID_CPUSET bool "Include legacy /proc/<pid>/cpuset file" - depends on CPUSETS + depends on CPUSETS_V1 default y config CGROUP_DEVICE @@ -1454,6 +1458,13 @@ config INITRAMFS_PRESERVE_MTIME If unsure, say Y. +config INITRAMFS_TEST + bool "Test initramfs cpio archive extraction" if !KUNIT_ALL_TESTS + depends on BLK_DEV_INITRD && KUNIT=y + default KUNIT_ALL_TESTS + help + Build KUnit tests for initramfs. See Documentation/dev-tools/kunit + choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE @@ -1869,11 +1880,6 @@ config KALLSYMS_ALL Say N unless you really need all symbols, or kernel live patching. 
-config KALLSYMS_ABSOLUTE_PERCPU - bool - depends on KALLSYMS - default X86_64 && SMP - # end of the "standard kernel features (expert users)" menu config ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/init/Makefile b/init/Makefile index 10b652d33e87..d6f75d8907e0 100644 --- a/init/Makefile +++ b/init/Makefile @@ -12,6 +12,7 @@ else obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o endif obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o +obj-$(CONFIG_INITRAMFS_TEST) += initramfs_test.o obj-y += init_task.o diff --git a/init/initramfs.c b/init/initramfs.c index b2f7583bb1f5..72bad44a1d41 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -20,6 +20,7 @@ #include <linux/security.h> #include "do_mounts.h" +#include "initramfs_internal.h" static __initdata bool csum_present; static __initdata u32 io_csum; @@ -75,6 +76,7 @@ static __initdata struct hash { struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; +static __initdata bool hardlink_seen; static inline int hash(int major, int minor, int ino) { @@ -108,19 +110,21 @@ static char __init *find_link(int major, int minor, int ino, strcpy(q->name, name); q->next = NULL; *p = q; + hardlink_seen = true; return NULL; } static void __init free_hash(void) { struct hash **p, *q; - for (p = head; p < head + 32; p++) { + for (p = head; hardlink_seen && p < head + 32; p++) { while (*p) { q = *p; *p = q->next; kfree(q); } } + hardlink_seen = false; } #ifdef CONFIG_INITRAMFS_PRESERVE_MTIME @@ -143,9 +147,8 @@ struct dir_entry { char name[]; }; -static void __init dir_add(const char *name, time64_t mtime) +static void __init dir_add(const char *name, size_t nlen, time64_t mtime) { - size_t nlen = strlen(name) + 1; struct dir_entry *de; de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); @@ -169,7 +172,7 @@ static void __init dir_utime(void) #else static void __init do_utime(char *filename, time64_t mtime) {} static void __init do_utime_path(const struct path *path, time64_t mtime) {} -static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_add(const char *name, size_t nlen, time64_t mtime) {} static void __init dir_utime(void) {} #endif @@ -188,14 +191,11 @@ static __initdata u32 hdr_csum; static void __init parse_header(char *s) { unsigned long parsed[13]; - char buf[9]; int i; - buf[8] = '\0'; - for (i = 0, s += 6; i < 13; i++, s += 8) { - memcpy(buf, s, 8); - parsed[i] = simple_strtoul(buf, NULL, 16); - } + for (i = 0, s += 6; i < 13; i++, s += 8) + parsed[i] = simple_strntoul(s, NULL, 16, 8); + ino = parsed[0]; mode = parsed[1]; uid = parsed[2]; @@ -256,7 +256,7 @@ static __initdata char *header_buf, *symlink_buf, *name_buf; static int __init do_start(void) { - read_into(header_buf, 110, GotHeader); + read_into(header_buf, CPIO_HDRLEN, GotHeader); return 0; } @@ -396,7 +396,7 @@ static int __init do_name(void) init_mkdir(collected, mode); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); - dir_add(collected, mtime); + dir_add(collected, name_len, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { @@ -497,20 +497,33 @@ static unsigned long my_inptr __initdata; /* index of next byte to be processed #include <linux/decompress/generic.h> -static char * __init unpack_to_rootfs(char *buf, unsigned long len) +/** + * unpack_to_rootfs - decompress and extract an initramfs archive + * @buf: input initramfs archive to extract + * @len: length of initramfs data to process + * + * Returns: NULL for success or an error message string + * + * 
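For reference, the "newc" header that parse_header() walks is CPIO_HDRLEN = 110 bytes: the 6-byte magic ("070701" or "070702") followed by thirteen 8-digit hex fields in the order ino, mode, uid, gid, nlink, mtime, filesize, devmajor, devminor, rdevmajor, rdevminor, namesize, csum. A tiny userspace sketch of the same walk (mirroring the kernel loop, but using standard strtoul on a copied 8-byte chunk):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* newc header: "070701" + 13 fields of 8 hex digits = 110 bytes */
static void parse_newc(const char *hdr, unsigned long out[13])
{
	char buf[9] = { 0 };
	int i;

	for (i = 0; i < 13; i++) {
		memcpy(buf, hdr + 6 + 8 * i, 8);
		out[i] = strtoul(buf, NULL, 16);
	}
}

int main(void)
{
	/* ino=1, mode=0100644, nlink=1, namesize=5, everything else 0 */
	const char hdr[] =
		"070701" "00000001" "000081a4" "00000000" "00000000"
		"00000001" "00000000" "00000000" "00000000" "00000000"
		"00000000" "00000000" "00000005" "00000000";
	unsigned long f[13];

	parse_newc(hdr, f);
	printf("ino %lu mode %lo namesize %lu\n", f[0], f[1], f[11]);
	return 0;
}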
This symbol shouldn't be used externally. It's available for unit tests. + */ +char * __init unpack_to_rootfs(char *buf, unsigned long len) { long written; decompress_fn decompress; const char *compress_name; - static __initdata char msg_buf[64]; + struct { + char header[CPIO_HDRLEN]; + char symlink[PATH_MAX + N_ALIGN(PATH_MAX) + 1]; + char name[N_ALIGN(PATH_MAX)]; + } *bufs = kmalloc(sizeof(*bufs), GFP_KERNEL); - header_buf = kmalloc(110, GFP_KERNEL); - symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); - name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); - - if (!header_buf || !symlink_buf || !name_buf) + if (!bufs) panic_show_mem("can't allocate buffers"); + header_buf = bufs->header; + symlink_buf = bufs->symlink; + name_buf = bufs->name; + state = Start; this_header = 0; message = NULL; @@ -538,12 +551,9 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) if (res) error("decompressor failed"); } else if (compress_name) { - if (!message) { - snprintf(msg_buf, sizeof msg_buf, - "compression method %s not configured", - compress_name); - message = msg_buf; - } + pr_err("compression method %s not configured\n", + compress_name); + error("decompressor failed"); } else error("invalid magic at start of compressed archive"); if (state != Reset) @@ -553,9 +563,9 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) len -= my_inptr; } dir_utime(); - kfree(name_buf); - kfree(symlink_buf); - kfree(header_buf); + /* free any hardlink state collected without optional TRAILER!!! */ + free_hash(); + kfree(bufs); return message; } diff --git a/init/initramfs_internal.h b/init/initramfs_internal.h new file mode 100644 index 000000000000..233dad16b0a0 --- /dev/null +++ b/init/initramfs_internal.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __INITRAMFS_INTERNAL_H__ +#define __INITRAMFS_INTERNAL_H__ + +char *unpack_to_rootfs(char *buf, unsigned long len); +#define CPIO_HDRLEN 110 + +#endif diff --git a/init/initramfs_test.c b/init/initramfs_test.c new file mode 100644 index 000000000000..517e5e04e5cc --- /dev/null +++ b/init/initramfs_test.c @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init_syscalls.h> +#include <linux/stringify.h> +#include <linux/timekeeping.h> +#include "initramfs_internal.h" + +struct initramfs_test_cpio { + char *magic; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int nlink; + unsigned int mtime; + unsigned int filesize; + unsigned int devmajor; + unsigned int devminor; + unsigned int rdevmajor; + unsigned int rdevminor; + unsigned int namesize; + unsigned int csum; + char *fname; + char *data; +}; + +static size_t fill_cpio(struct initramfs_test_cpio *cs, size_t csz, char *out) +{ + int i; + size_t off = 0; + + for (i = 0; i < csz; i++) { + char *pos = &out[off]; + struct initramfs_test_cpio *c = &cs[i]; + size_t thislen; + + /* +1 to account for nulterm */ + thislen = sprintf(pos, "%s" + "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x" + "%s", + c->magic, c->ino, c->mode, c->uid, c->gid, c->nlink, + c->mtime, c->filesize, c->devmajor, c->devminor, + c->rdevmajor, c->rdevminor, c->namesize, c->csum, + c->fname) + 1; + pr_debug("packing (%zu): %.*s\n", thislen, (int)thislen, pos); + off += thislen; + while (off & 3) + out[off++] = '\0'; + + memcpy(&out[off], c->data, c->filesize); + off += c->filesize; + while (off & 3) + out[off++] = 
'\0'; + } + + return off; +} + +static void __init initramfs_test_extract(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct timespec64 ts_before, ts_after; + struct kstat st = {}; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 12, + .gid = 34, + .nlink = 1, + .mtime = 56, + .filesize = 0, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_extract"), + .csum = 0, + .fname = "initramfs_test_extract", + }, { + .magic = "070701", + .ino = 2, + .mode = S_IFDIR | 0777, + .nlink = 1, + .mtime = 57, + .devminor = 1, + .namesize = sizeof("initramfs_test_extract_dir"), + .fname = "initramfs_test_extract_dir", + }, { + .magic = "070701", + .namesize = sizeof("TRAILER!!!"), + .fname = "TRAILER!!!", + } }; + + /* +3 to cater for any 4-byte end-alignment */ + cpio_srcbuf = kzalloc(ARRAY_SIZE(c) * (CPIO_HDRLEN + PATH_MAX + 3), + GFP_KERNEL); + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + ktime_get_real_ts64(&ts_before); + err = unpack_to_rootfs(cpio_srcbuf, len); + ktime_get_real_ts64(&ts_after); + if (err) { + KUNIT_FAIL(test, "unpack failed %s", err); + goto out; + } + + KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st, 0), 0); + KUNIT_EXPECT_TRUE(test, S_ISREG(st.mode)); + KUNIT_EXPECT_TRUE(test, uid_eq(st.uid, KUIDT_INIT(c[0].uid))); + KUNIT_EXPECT_TRUE(test, gid_eq(st.gid, KGIDT_INIT(c[0].gid))); + KUNIT_EXPECT_EQ(test, st.nlink, 1); + if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) { + KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[0].mtime); + } else { + KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec); + KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec); + } + KUNIT_EXPECT_EQ(test, st.blocks, c[0].filesize); + + KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st, 0), 0); + KUNIT_EXPECT_TRUE(test, S_ISDIR(st.mode)); + if (IS_ENABLED(CONFIG_INITRAMFS_PRESERVE_MTIME)) { + KUNIT_EXPECT_EQ(test, st.mtime.tv_sec, c[1].mtime); + } else { + KUNIT_EXPECT_GE(test, st.mtime.tv_sec, ts_before.tv_sec); + KUNIT_EXPECT_LE(test, st.mtime.tv_sec, ts_after.tv_sec); + } + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_rmdir(c[1].fname), 0); +out: + kfree(cpio_srcbuf); +} + +/* + * Don't terminate filename. Previously, the cpio filename field was passed + * directly to filp_open(collected, O_CREAT|..) without nulterm checks. See + * https://lore.kernel.org/linux-fsdevel/20241030035509.20194-2-ddiss@suse.de + */ +static void __init initramfs_test_fname_overrun(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len, suffix_off; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 0, + .gid = 0, + .nlink = 1, + .mtime = 1, + .filesize = 0, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_fname_overrun"), + .csum = 0, + .fname = "initramfs_test_fname_overrun", + } }; + + /* + * poison cpio source buffer, so we can detect overrun. source + * buffer is used by read_into() when hdr or fname + * are already available (e.g. no compression). 
+ */ + cpio_srcbuf = kmalloc(CPIO_HDRLEN + PATH_MAX + 3, GFP_KERNEL); + memset(cpio_srcbuf, 'B', CPIO_HDRLEN + PATH_MAX + 3); + /* limit overrun to avoid crashes / filp_open() ENAMETOOLONG */ + cpio_srcbuf[CPIO_HDRLEN + strlen(c[0].fname) + 20] = '\0'; + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + /* overwrite trailing fname terminator and padding */ + suffix_off = len - 1; + while (cpio_srcbuf[suffix_off] == '\0') { + cpio_srcbuf[suffix_off] = 'P'; + suffix_off--; + } + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NOT_NULL(test, err); + + kfree(cpio_srcbuf); +} + +static void __init initramfs_test_data(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct file *file; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .uid = 0, + .gid = 0, + .nlink = 1, + .mtime = 1, + .filesize = sizeof("ASDF") - 1, + .devmajor = 0, + .devminor = 1, + .rdevmajor = 0, + .rdevminor = 0, + .namesize = sizeof("initramfs_test_data"), + .csum = 0, + .fname = "initramfs_test_data", + .data = "ASDF", + } }; + + /* +6 for max name and data 4-byte padding */ + cpio_srcbuf = kmalloc(CPIO_HDRLEN + c[0].namesize + c[0].filesize + 6, + GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + file = filp_open(c[0].fname, O_RDONLY, 0); + if (IS_ERR(file)) { + KUNIT_FAIL(test, "open failed"); + goto out; + } + + /* read back file contents into @cpio_srcbuf and confirm match */ + len = kernel_read(file, cpio_srcbuf, c[0].filesize, NULL); + KUNIT_EXPECT_EQ(test, len, c[0].filesize); + KUNIT_EXPECT_MEMEQ(test, cpio_srcbuf, c[0].data, len); + + fput(file); + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); +out: + kfree(cpio_srcbuf); +} + +static void __init initramfs_test_csum(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct initramfs_test_cpio c[] = { { + /* 070702 magic indicates a valid csum is present */ + .magic = "070702", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 1, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_csum"), + .csum = 'A' + 'S' + 'D' + 'F', + .fname = "initramfs_test_csum", + .data = "ASDF", + }, { + /* mix csum entry above with no-csum entry below */ + .magic = "070701", + .ino = 2, + .mode = S_IFREG | 0777, + .nlink = 1, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_csum_not_here"), + /* csum ignored */ + .csum = 5555, + .fname = "initramfs_test_csum_not_here", + .data = "ASDF", + } }; + + cpio_srcbuf = kmalloc(8192, GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0); + + /* mess up the csum and confirm that unpack fails */ + c[0].csum--; + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NOT_NULL(test, err); + + /* + * file (with content) is still retained in case of bad-csum abort. + * Perhaps we should change this. 
+ */ + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), -ENOENT); + kfree(cpio_srcbuf); +} + +/* + * hardlink hashtable may leak when the archive omits a trailer: + * https://lore.kernel.org/r/20241107002044.16477-10-ddiss@suse.de/ + */ +static void __init initramfs_test_hardlink(struct kunit *test) +{ + char *err, *cpio_srcbuf; + size_t len; + struct kstat st0, st1; + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 2, + .devminor = 1, + .namesize = sizeof("initramfs_test_hardlink"), + .fname = "initramfs_test_hardlink", + }, { + /* hardlink data is present in last archive entry */ + .magic = "070701", + .ino = 1, + .mode = S_IFREG | 0777, + .nlink = 2, + .filesize = sizeof("ASDF") - 1, + .devminor = 1, + .namesize = sizeof("initramfs_test_hardlink_link"), + .fname = "initramfs_test_hardlink_link", + .data = "ASDF", + } }; + + cpio_srcbuf = kmalloc(8192, GFP_KERNEL); + + len = fill_cpio(c, ARRAY_SIZE(c), cpio_srcbuf); + + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_stat(c[0].fname, &st0, 0), 0); + KUNIT_EXPECT_EQ(test, init_stat(c[1].fname, &st1, 0), 0); + KUNIT_EXPECT_EQ(test, st0.ino, st1.ino); + KUNIT_EXPECT_EQ(test, st0.nlink, 2); + KUNIT_EXPECT_EQ(test, st1.nlink, 2); + + KUNIT_EXPECT_EQ(test, init_unlink(c[0].fname), 0); + KUNIT_EXPECT_EQ(test, init_unlink(c[1].fname), 0); + + kfree(cpio_srcbuf); +} + +#define INITRAMFS_TEST_MANY_LIMIT 1000 +#define INITRAMFS_TEST_MANY_PATH_MAX (sizeof("initramfs_test_many-") \ + + sizeof(__stringify(INITRAMFS_TEST_MANY_LIMIT))) +static void __init initramfs_test_many(struct kunit *test) +{ + char *err, *cpio_srcbuf, *p; + size_t len = INITRAMFS_TEST_MANY_LIMIT * + (CPIO_HDRLEN + INITRAMFS_TEST_MANY_PATH_MAX + 3); + char thispath[INITRAMFS_TEST_MANY_PATH_MAX]; + int i; + + p = cpio_srcbuf = kmalloc(len, GFP_KERNEL); + + for (i = 0; i < INITRAMFS_TEST_MANY_LIMIT; i++) { + struct initramfs_test_cpio c = { + .magic = "070701", + .ino = i, + .mode = S_IFREG | 0777, + .nlink = 1, + .devminor = 1, + .fname = thispath, + }; + + c.namesize = 1 + sprintf(thispath, "initramfs_test_many-%d", i); + p += fill_cpio(&c, 1, p); + } + + len = p - cpio_srcbuf; + err = unpack_to_rootfs(cpio_srcbuf, len); + KUNIT_EXPECT_NULL(test, err); + + for (i = 0; i < INITRAMFS_TEST_MANY_LIMIT; i++) { + sprintf(thispath, "initramfs_test_many-%d", i); + KUNIT_EXPECT_EQ(test, init_unlink(thispath), 0); + } + + kfree(cpio_srcbuf); +} + +/* + * The kunit_case/_suite struct cannot be marked as __initdata as this will be + * used in debugfs to retrieve results after test has run. 
+ */ +static struct kunit_case __refdata initramfs_test_cases[] = { + KUNIT_CASE(initramfs_test_extract), + KUNIT_CASE(initramfs_test_fname_overrun), + KUNIT_CASE(initramfs_test_data), + KUNIT_CASE(initramfs_test_csum), + KUNIT_CASE(initramfs_test_hardlink), + KUNIT_CASE(initramfs_test_many), + {}, +}; + +static struct kunit_suite initramfs_test_suite = { + .name = "initramfs", + .test_cases = initramfs_test_cases, +}; +kunit_test_init_section_suites(&initramfs_test_suite); + +MODULE_DESCRIPTION("Initramfs KUnit test suite"); +MODULE_LICENSE("GPL v2"); diff --git a/init/main.c b/init/main.c index 2a1757826397..7f0a2a3dbd29 100644 --- a/init/main.c +++ b/init/main.c @@ -1553,7 +1553,6 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); - rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f7acae5f7e1d..6427d71c773b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2423,7 +2423,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) goto out_wake; } - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: diff --git a/io_uring/net.c b/io_uring/net.c index 5d0b56ff50ee..50e8a3ccc9de 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -148,7 +148,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } @@ -441,7 +441,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags) { - req->flags &= ~REQ_F_NEED_CLEANUP; io_netmsg_recycle(req, issue_flags); } @@ -1441,6 +1440,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); + zc->notif = NULL; io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); @@ -1501,6 +1501,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); + sr->notif = NULL; io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 48fc8cf70784..c5fb817b1e28 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -407,8 +407,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; + hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } @@ -430,8 +429,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, data->ts = *ts; list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } @@ -557,7 +555,6 @@ static int 
__io_timeout_prep(struct io_kiocb *req, return -EINVAL; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; @@ -568,6 +565,10 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; + hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data), + data->mode); + } else { + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode); } return 0; } @@ -627,7 +628,6 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; @@ -646,7 +646,6 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); diff --git a/kernel/Makefile b/kernel/Makefile index 87866b037fbe..434929de17ef 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING +endif + # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7f358740e958..367eaf2c78b7 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -350,11 +350,10 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - if (d_is_positive(d)) { - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; - } + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; @@ -419,10 +418,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) { + if (ret && ret != -ENOENT) { audit_put_watch(watch); return ret; } + ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c853cde9abe..78fd876a5473 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) { - atomic_inc(&n->name->refcnt); - return n->name; - } + if (n->name->uptr == uptr) + return refname(n->name); } return NULL; } @@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - atomic_inc(&name->refcnt); + refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2369,7 +2367,7 @@ out_alloc: return; if (name) { n->name = name; - atomic_inc(&name->refcnt); + refname(name); } out: 
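The io_uring and BPF timer hunks above (and the earlier io_cqring change) are instances of the same mechanical conversion: an hrtimer_init() call plus a separate assignment of ->function collapses into a single hrtimer_setup() call, and retargeting a live timer goes through hrtimer_update_function(). A sketch of the pattern, with my_timer_fn and my_other_timer_fn as hypothetical callbacks (kernel-internal API, not standalone code):

	/* before */
	hrtimer_init(&t->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->timer.function = my_timer_fn;

	/* after: the callback is bound at setup time */
	hrtimer_setup(&t->timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/* changing the callback on an initialized timer now uses a helper, too */
	hrtimer_update_function(&t->timer, my_other_timer_fn);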
@@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - atomic_inc(&found_child->name->refcnt); + refname(found_child->name); } } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f27ce162427a..672abe111282 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9aaf5124648b..dc3aa91a6ba0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } -static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); - return 0; + return NULL; } struct map_iter { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 60611df77957..6e604caa870c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21702,12 +21702,12 @@ patch_map_ops_generic: if (insn->imm == BPF_FUNC_get_smp_processor_id && verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever + * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad776717..19be79639542 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include <linux/cfi.h> +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..95ab39e1ec8f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern 
struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..11ea8d24ac72 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index afc665b7b1fe..f231fe3a0744 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -4447,7 +4447,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -5831,7 +5831,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: @@ -373,6 +375,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. 
+ */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); @@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973..39c1fc643d77 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include <linux/init.h> @@ -954,10 +953,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void dl_rebuild_rd_accounting(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); @@ -965,11 +966,12 @@ static void dl_rebuild_rd_accounting(void) rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. 
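The dl_rebuild_rd_accounting() hunk nearby switches to walking every possible CPU and clearing each root domain only once per rebuild, keyed by a generation cookie checked through dl_bw_visited(). Below is a condensed sketch of that visit-once-per-domain idea; struct example_domain, domain_of() and domain_clear() are stand-ins, not the scheduler's real data structures.

/* Sketch of the "visit each shared domain once per pass" pattern. */
struct example_domain {
	u64 visit_cookie;
};

static u64 rebuild_cookie;

/* Stand-ins: how CPUs map to shared domains is not the point here. */
extern struct example_domain *domain_of(int cpu);
extern void domain_clear(struct example_domain *d);

static bool domain_visited(struct example_domain *d, u64 cookie)
{
	if (d->visit_cookie == cookie)
		return true;	/* another CPU of this domain was handled already */
	d->visit_cookie = cookie;
	return false;
}

static void rebuild_all_domains(void)
{
	u64 cookie = ++rebuild_cookie;	/* new generation for this pass */
	int cpu;

	for_each_possible_cpu(cpu) {
		struct example_domain *d = domain_of(cpu);

		if (domain_visited(d, cookie))
			continue;
		domain_clear(d);	/* cleared at most once per rebuild */
	}
}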
- */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -990,16 +992,6 @@ static void dl_rebuild_rd_accounting(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - dl_rebuild_rd_accounting(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1061,7 +1053,7 @@ void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) @@ -1083,6 +1075,13 @@ void rebuild_sched_domains(void) cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..039d1eb2f215 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. 
" + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..2fa3a4fb2aaf 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type) } /** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - -/** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. * @capacity: Supported capacity of the misc res on the host. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index aac91466279f..4bb587d5d34f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) spin_unlock_irq(&cgroup_rstat_lock); } -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) -{ - int cpu; - - lockdep_assert_held(&cgroup_rstat_lock); - - for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - - for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - } - - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } - } -} - /** * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup @@ -348,38 +314,30 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) */ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { + int cpu; + might_sleep(); + for_each_possible_cpu(cpu) { + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); - __cgroup_rstat_unlock(cgrp, -1); -} + /* Reacquire for each CPU to avoid disabling IRQs too long */ + __cgroup_rstat_lock(cgrp, cpu); + for (; pos; pos = pos->rstat_flush_next) { + struct cgroup_subsys_state *css; -/** - * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold - * @cgrp: target cgroup - * - * Flush stats in @cgrp's subtree and prevent further flushes. Must be - * paired with cgroup_rstat_flush_release(). - * - * This function may block. 
- */ -void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_lock) -{ - might_sleep(); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); -} + cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); -/** - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() - * @cgrp: cgroup used by tracepoint - */ -void cgroup_rstat_flush_release(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) -{ - __cgroup_rstat_unlock(cgrp, -1); + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + __cgroup_rstat_unlock(cgrp, cpu); + if (!cond_resched()) + cpu_relax(); + } } int cgroup_rstat_init(struct cgroup *cgrp) @@ -612,36 +570,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + cgroup_rstat_flush(cgrp); + __cgroup_rstat_lock(cgrp, -1); + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; - cgroup_rstat_flush_release(cgrp); + &bstat.cputime.utime, &bstat.cputime.stime); + __cgroup_rstat_unlock(cgrp, -1); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 3fabb8f55ef6..dd7c32fb5ac1 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set -# CONFIG_UBSAN_SIGNED_WRAP is not set +# CONFIG_UBSAN_INTEGER_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! 
- seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. } /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 07455d25329c..ad755db29efd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1453,11 +1453,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); - /* - * Do post unplug cleanup. This is still protected against - * concurrent CPU hotplug via cpu_add_remove_lock. - */ - lockup_detector_cleanup(); arch_smt_update(); return ret; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 078fe5bc5a74..335b8425dd4b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5b4e6d3bf7bc..b8fe0b3d0ffb 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -584,6 +584,22 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma_unencrypted(dev, min_mask); } +static const struct bus_dma_region *dma_find_range(struct device *dev, + unsigned long start_pfn) +{ + const struct bus_dma_region *m; + + for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { + unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + + if (start_pfn >= cpu_start_pfn && + start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) + return m; + } + + return NULL; +} + /* * To check whether all ram resource ranges are covered by dma range map * Returns 0 when further check is needed @@ -593,20 +609,12 @@ static int check_ram_in_range_map(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long end_pfn = start_pfn + nr_pages; - const struct bus_dma_region *bdr = NULL; - const struct bus_dma_region *m; struct device *dev = data; while (start_pfn < end_pfn) { - for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { - unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + const struct bus_dma_region *bdr; - if (start_pfn >= cpu_start_pfn && - start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { - bdr = m; - break; - } - } + bdr = dma_find_range(dev, start_pfn); if (!bdr) return 1; diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 095c775e001e..d4b8bd0af79b 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -6,6 +6,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e33691d5adf7..20154572ede9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Do seccomp after ptrace, to catch any tracer changes. 
*/ if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); + ret = __secure_computing(); if (ret == -1L) return ret; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8a47e52a454f..6c83ad674d01 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -22,6 +22,7 @@ struct callchain_cpus_entries { int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; +static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { @@ -266,12 +267,8 @@ exit_put: return entry; } -/* - * Used for sysctl_perf_event_max_stack and - * sysctl_perf_event_max_contexts_per_stack. - */ -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int perf_event_max_stack_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; @@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write, return ret; } + +static const struct ctl_table callchain_sysctl_table[] = { + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static int __init init_callchain_sysctls(void) +{ + register_sysctl_init("kernel", callchain_sysctl_table); + return 0; +} +core_initcall(init_callchain_sysctls); + diff --git a/kernel/events/core.c b/kernel/events/core.c index 823aa0824916..5d2221ec6d7c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -55,6 +55,7 @@ #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> +#include <linux/percpu-rwsem.h> #include "internal.h" @@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. 
*/ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. It's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). 
* If they take too much CPU time, the system can lock up and not @@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); -} - -static void get_ctx(struct perf_event_context *ctx) -{ - refcount_inc(&ctx->refcount); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } -static void *alloc_task_ctx_data(struct pmu *pmu) +static inline void perf_pmu_read(struct perf_event *event) { - if (pmu->task_ctx_cache) - return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - - return NULL; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } -static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +static void get_ctx(struct perf_event_context *ctx) { - if (pmu->task_ctx_cache && task_ctx_data) - kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -2303,7 +2347,7 @@ static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event, pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2584,7 +2627,7 @@ static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2691,7 +2734,7 @@ error: static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. 
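Several hunks in this series (bpf/helpers.c earlier, __perf_mux_hrtimer_init() above, perf_swevent_init_hrtimer() further down) replace the hrtimer_init()-then-assign-the-callback pair with a single hrtimer_setup() call, so the timer can never be observed with an unset function. A minimal before/after sketch; my_timer_fn is an invented callback:

/* Sketch of the conversion pattern. */
#include <linux/hrtimer.h>

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	return HRTIMER_NORESTART;
}

static void my_timer_init(struct hrtimer *t)
{
	/* Old style, two steps:
	 *   hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   t->function = my_timer_fn;
	 *
	 * New style, one call sets clock, mode and callback together:
	 */
	hrtimer_setup(t, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}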
@@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ - for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ - pos2 = list_first_entry(head2, typeof(*pos2), member); \ - !list_entry_is_head(pos1, head1, member) && \ - !list_entry_is_head(pos2, head2, member); \ - pos1 = list_next_entry(pos1, member), \ - pos2 = list_next_entry(pos2, member)) - -static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event_pmu_context *prev_epc, *next_epc; - - if (!prev_ctx->nr_task_data) - return; - - double_list_for_each_entry(prev_epc, next_epc, - &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, - pmu_ctx_entry) { - - if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) - continue; - - /* - * PMU specific parts of task perf context can require - * additional synchronization. As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (prev_epc->pmu->swap_task_ctx) - prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); - else - swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); - } -} - -static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, + struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) - pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } @@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_ctx_sched_task_cb(ctx, false); - perf_event_swap_task_ctx_data(ctx, next_ctx); + perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of - * ctx->task and ctx->task_ctx_data are immaterial - * since those values are always verified under - * ctx->lock which we're now holding. + * ctx->task is immaterial since this value is + * always verified under ctx->lock which we're now + * holding. 
*/ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); @@ -3660,7 +3665,7 @@ unlock: perf_ctx_disable(ctx, false); inside_switch: - perf_ctx_sched_task_cb(ctx, false); + perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); @@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, + struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; @@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpc->task_epc, sched_in); + pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) - __perf_pmu_sched_task(cpc, sched_in); + __perf_pmu_sched_task(cpc, sched_in ? 
next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, @@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data) if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + if (*perf_event_fasync(event)) + event->pending_kill = POLL_HUP; + + perf_event_wakeup(event); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } @@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); - perf_ctx_sched_task_cb(ctx, true); + perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); @@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_event_sched_in(cpuctx, ctx, NULL); - perf_ctx_sched_task_cb(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); @@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -4951,7 +4954,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; - void *task_ctx_data = NULL; if (!ctx->task) { /* @@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, */ struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { - atomic_set(&epc->refcount, 1); + /* + * One extra reference for the pmu; see perf_pmu_free(). 
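The hunks just above extend pmu::sched_task() with a task_struct argument: perf_pmu_sched_task() now hands the callback the outgoing task on sched-out and the incoming task on sched-in. A sketch of what a driver-side callback might look like under the new three-argument form; the PMU and its helpers (example_flush_buffers(), example_load_task_state()) are assumptions, not an existing driver.

/* Sketch: a hypothetical PMU's sched_task callback. */
#include <linux/perf_event.h>

extern void example_flush_buffers(struct task_struct *task);
extern void example_load_task_state(struct task_struct *task);

static void example_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
				   struct task_struct *task, bool sched_in)
{
	if (!sched_in) {
		/* "task" is the task being scheduled out; drain its records. */
		example_flush_buffers(task);
		return;
	}
	/* "task" is the task being scheduled in; re-program per-task state. */
	example_load_task_state(task);
}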
+ */ + atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; @@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, if (!new) return ERR_PTR(-ENOMEM); - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - kfree(new); - return ERR_PTR(-ENOMEM); - } - } - __perf_init_event_pmu_context(new, pmu); /* @@ -5023,14 +5020,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, epc->ctx = ctx; found_epc: - if (task_ctx_data && !epc->task_ctx_data) { - epc->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - ctx->nr_task_data++; - } raw_spin_unlock_irq(&ctx->lock); - - free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; @@ -5041,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc) WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc); +} + static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); - kfree(epc->task_ctx_data); kfree(epc); } @@ -5075,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc) raw_spin_unlock_irqrestore(&ctx->lock, flags); - if (epc->embedded) + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); return; + } call_rcu(&epc->rcu_head, free_epc_rcu); } @@ -5152,6 +5151,225 @@ static void unaccount_freq_event(void) atomic_dec(&nr_freq_events); } + +static struct perf_ctx_data * +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +{ + struct perf_ctx_data *cd; + + cd = kzalloc(sizeof(*cd), GFP_KERNEL); + if (!cd) + return NULL; + + cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + if (!cd->data) { + kfree(cd); + return NULL; + } + + cd->global = global; + cd->ctx_cache = ctx_cache; + refcount_set(&cd->refcount, 1); + + return cd; +} + +static void free_perf_ctx_data(struct perf_ctx_data *cd) +{ + kmem_cache_free(cd->ctx_cache, cd->data); + kfree(cd); +} + +static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) +{ + struct perf_ctx_data *cd; + + cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); + free_perf_ctx_data(cd); +} + +static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) +{ + call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); +} + +static int +attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, + bool global) +{ + struct perf_ctx_data *cd, *old = NULL; + + cd = alloc_perf_ctx_data(ctx_cache, global); + if (!cd) + return -ENOMEM; + + for (;;) { + if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (old) + perf_free_ctx_data_rcu(old); + return 0; + } + + if (!old) { + /* + * After seeing a dead @old, we raced with + * removal and lost, try again to install @cd. + */ + continue; + } + + if (refcount_inc_not_zero(&old->refcount)) { + free_perf_ctx_data(cd); /* unused */ + return 0; + } + + /* + * @old is a dead object, refcount==0 is stable, try and + * replace it with @cd. 
+ */ + } + return 0; +} + +static void __detach_global_ctx_data(void); +DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); +static refcount_t global_ctx_data_ref; + +static int +attach_global_ctx_data(struct kmem_cache *ctx_cache) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + int ret; + + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; +again: + /* Allocate everything */ + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (cd && !cd->global) { + cd->global = 1; + if (!refcount_inc_not_zero(&cd->refcount)) + cd = NULL; + } + if (!cd) { + get_task_struct(p); + goto alloc; + } + } + } + + refcount_set(&global_ctx_data_ref, 1); + + return 0; +alloc: + ret = attach_task_ctx_data(p, ctx_cache, true); + put_task_struct(p); + if (ret) { + __detach_global_ctx_data(); + return ret; + } + goto again; +} + +static int +attach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; + int ret; + + if (!ctx_cache) + return -ENOMEM; + + if (task) + return attach_task_ctx_data(task, ctx_cache, false); + + ret = attach_global_ctx_data(ctx_cache); + if (ret) + return ret; + + event->attach_state |= PERF_ATTACH_GLOBAL_DATA; + return 0; +} + +static void +detach_task_ctx_data(struct task_struct *p) +{ + struct perf_ctx_data *cd; + + scoped_guard (rcu) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !refcount_dec_and_test(&cd->refcount)) + return; + } + + /* + * The old ctx_data may be lost because of the race. + * Nothing is required to do for the case. + * See attach_task_ctx_data(). 
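attach_global_ctx_data() above leans on the scope-based locking helpers from <linux/cleanup.h>: guard(percpu_write)(...) holds the percpu rwsem until the enclosing scope ends, and scoped_guard(rcu) { ... } brackets an RCU read-side section so every early return drops everything automatically. A minimal sketch with a plain mutex and an RCU list; example_lock, example_list and struct example_node are made up.

/* Sketch of scope-based locking. */
#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/rculist.h>

struct example_node {
	struct list_head node;
	int id;
};

static DEFINE_MUTEX(example_lock);
static LIST_HEAD(example_list);

static int example_add(struct example_node *n)
{
	guard(mutex)(&example_lock);		/* released automatically on return */

	scoped_guard(rcu) {			/* rcu_read_lock()/unlock() bracket */
		struct example_node *pos;

		list_for_each_entry_rcu(pos, &example_list, node)
			if (pos->id == n->id)
				return -EEXIST;	/* both the guard and the mutex drop here */
	}

	list_add_rcu(&n->node, &example_list);
	return 0;
}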
+ */ + if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); +} + +static void __detach_global_ctx_data(void) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + +again: + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !cd->global) + continue; + cd->global = 0; + get_task_struct(p); + goto detach; + } + } + return; +detach: + detach_task_ctx_data(p); + put_task_struct(p); + goto again; +} + +static void detach_global_ctx_data(void) +{ + if (refcount_dec_not_one(&global_ctx_data_ref)) + return; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (!refcount_dec_and_test(&global_ctx_data_ref)) + return; + + /* remove everything */ + __detach_global_ctx_data(); +} + +static void detach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + + event->attach_state &= ~PERF_ATTACH_TASK_DATA; + + if (task) + return detach_task_ctx_data(task); + + if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { + detach_global_ctx_data(); + event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; + } +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -5246,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5253,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5292,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); static void perf_pending_task_sync(struct perf_event *event) { @@ -5319,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void _free_event(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - irq_work_sync(&event->pending_irq); - irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); - - unaccount_event(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); - security_perf_event_free(event); + kfree(event->addr_filter_ranges); - if (event->rb) { - /* - * Can happen when we close an event with re-directed output. - * - * Since we have a 0 refcount, perf_mmap_close() will skip - * over us; possibly making our ring_buffer_put() the last. 
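The event-teardown rework in the hunks here splits destruction in two: __free_event() undoes only what perf_event_alloc() actually managed to set up, keyed off PERF_ATTACH_* bits (CALLCHAIN, EXCLUSIVE, TASK_DATA, ...) that are set as each step succeeds, while _free_event() handles the fully-live case and then falls through to __free_event(). A condensed sketch of the flag-driven unwind; the EX_* flags and struct ex_object are invented for illustration.

/* Sketch: record which setup steps completed, undo exactly those. */
#include <linux/slab.h>

#define EX_GOT_BUFFER	0x1
#define EX_GOT_FILTER	0x2

struct ex_object {
	unsigned int state;
	void *buffer;
	void *filter;
};

static void ex_destroy(struct ex_object *obj)
{
	/* Only undo the steps that were recorded as completed. */
	if (obj->state & EX_GOT_FILTER)
		kfree(obj->filter);
	if (obj->state & EX_GOT_BUFFER)
		kfree(obj->buffer);
	kfree(obj);
}

static struct ex_object *ex_create(void)
{
	struct ex_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;

	obj->buffer = kzalloc(64, GFP_KERNEL);
	if (!obj->buffer)
		goto err;
	obj->state |= EX_GOT_BUFFER;	/* record success of this step */

	obj->filter = kzalloc(64, GFP_KERNEL);
	if (!obj->filter)
		goto err;
	obj->state |= EX_GOT_FILTER;

	return obj;
err:
	ex_destroy(obj);		/* unwinds exactly what was set up */
	return NULL;
}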
- */ - mutex_lock(&event->mmap_mutex); - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - } + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); + if (event->attach_state & PERF_ATTACH_TASK_DATA) + detach_perf_ctx_data(event); if (event->destroy) event->destroy(event); @@ -5363,22 +5564,60 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); - if (event->pmu_ctx) + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); + } /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); - exclusive_event_destroy(event); - module_put(event->pmu->module); + if (event->pmu) + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ +static void _free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); + + unaccount_event(event); + + security_perf_event_free(event); + + if (event->rb) { + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + __free_event(event); +} + /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. @@ -5847,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (is_event_hup(event)) return events; + if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && + event->attr.pinned)) + return events; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -6616,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + int ret, flags = 0; /* * Don't allow mmap() of inherited per-task counters. 
This would @@ -6634,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + goto unlock; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + goto unlock; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6645,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) */ u64 aux_offset, aux_size; - if (!event->rb) - return -EINVAL; - - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; - rb = event->rb; if (!rb) goto aux_unlock; @@ -6695,48 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; - - goto accounting; - } - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - /* We need the rb to map pages. 
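The perf_mmap() rework in this hunk validates the mapping size up front: the data mapping must be exactly one metadata page plus a power-of-two number of data pages, and vma_size must match PAGE_SIZE * nr_pages. From userspace that constraint looks like the sketch below, which uses the raw perf_event_open(2) syscall (there is no glibc wrapper); the event choice and page count are arbitrary.

/* Userspace sketch: map a perf ring buffer of 1 + 2^n pages. */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	size_t page = sysconf(_SC_PAGESIZE);
	size_t n_data_pages = 8;		/* must be a power of two */
	void *rb;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP;
	attr.disabled = 1;

	fd = syscall(SYS_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* One metadata page + a power-of-two number of data pages. */
	rb = mmap(NULL, (1 + n_data_pages) * page, PROT_READ | PROT_WRITE,
		  MAP_SHARED, fd, 0);
	if (rb == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	struct perf_event_mmap_page *meta = rb;
	printf("ring buffer mapped, mmap page version %u\n", meta->version);

	munmap(rb, (1 + n_data_pages) * page);
	close(fd);
	return 0;
}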
*/ - rb = event->rb; - goto unlock; } - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -6804,6 +7042,8 @@ accounting: rb->aux_mmap_locked = extra; } + ret = 0; + unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -6828,7 +7068,7 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (event->pmu->event_mapped) + if (!ret && event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; @@ -7452,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7467,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7528,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -8522,10 +8763,58 @@ static void perf_event_task(struct task_struct *task, task_ctx); } +/* + * Allocate data for a new task when profiling system-wide + * events which require PMU specific data + */ +static void +perf_event_alloc_task_data(struct task_struct *child, + struct task_struct *parent) +{ + struct kmem_cache *ctx_cache = NULL; + struct perf_ctx_data *cd; + + if (!refcount_read(&global_ctx_data_ref)) + return; + + scoped_guard (rcu) { + cd = rcu_dereference(parent->perf_ctx_data); + if (cd) + ctx_cache = cd->ctx_cache; + } + + if (!ctx_cache) + return; + + guard(percpu_read)(&global_ctx_data_rwsem); + scoped_guard (rcu) { + cd = rcu_dereference(child->perf_ctx_data); + if (!cd) { + /* + * A system-wide event may be unaccount, + * when attaching the perf_ctx_data. 
+ */ + if (!refcount_read(&global_ctx_data_ref)) + return; + goto attach; + } + + if (!cd->global) { + cd->global = 1; + refcount_inc(&cd->refcount); + } + } + + return; +attach: + attach_task_ctx_data(child, ctx_cache, true); +} + void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); + perf_event_alloc_task_data(task, current); } /* @@ -8589,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strscpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -9033,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strscpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -9457,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strscpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -10840,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; @@ -10938,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. 
@@ -11376,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -11614,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11628,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -11639,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); + return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -11650,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); + return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11681,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); @@ -11824,6 +12121,7 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } @@ -11845,57 +12143,85 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) return true; } -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static void perf_pmu_free(struct pmu *pmu) { - int cpu, ret, max = PERF_TYPE_MAX; + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (pmu->cpu_pmu_context) { + int cpu; - pmu->type = -1; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free_pdc; - } + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free_pdc; + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} + +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu *_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if 
(WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - type = ret; - pmu->type = type; + pmu->type = pmu_type.id; atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + return ret; } - ret = -ENOMEM; - pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) - goto free_dev; + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + return -ENOMEM; + + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } @@ -11932,39 +12258,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) * Now that the PMU is complete, make it visible to perf_try_init_event(). */ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) - goto free_context; + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_context: - free_percpu(pmu->cpu_pmu_context); - -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); - } - -free_idr: - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); - idr_remove(&pmu_idr, pmu->type); - mutex_unlock(&pmus_lock); + scoped_guard (mutex, &pmus_lock) { + list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + } /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -11973,14 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); - } - free_pmu_context(pmu); + perf_pmu_free(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -12020,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - 
struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; - if (ret && event->destroy) - event->destroy(event); + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; + + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; + + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) - module_put(pmu->module); + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain @@ -12074,7 +12388,7 @@ static struct pmu *perf_init_event(struct perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12093,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12108,27 +12421,21 @@ again: } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -12255,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12270,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12378,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). 
*/ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; + if (IS_ERR(pmu)) + return (void*)pmu; + + /* + * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). + * The attach should be right after the perf_init_event(). + * Otherwise, the __free_event() would mistakenly detach the non-exist + * perf_ctx_data because of the other errors between them. + */ + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + err = attach_perf_ctx_data(event); + if (err) + return ERR_PTR(err); } /* @@ -12394,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err_pmu; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err_pmu; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err_per_task; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12460,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); - - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -13569,6 +13852,12 @@ void perf_event_exit_task(struct task_struct 
*child) * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); + + /* + * Detach the perf_ctx_data for the system-wide event. + */ + guard(percpu_read)(&global_ctx_data_rwsem); + detach_task_ctx_data(child); } static void perf_free_event(struct perf_event *event, @@ -13744,7 +14033,6 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } @@ -14002,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); + child->perf_ctx_data = NULL; ret = perf_event_init_context(child, clone_flags); if (ret) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6d..8ec2cb688903 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 180509132d4b..5130b119d0ae 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, EPOLLIN); + atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM); handle->event->pending_wakeup = 1; @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b4ca8898fe17..70c84b9d7be3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2169,8 +2169,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) */ unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ @@ -2311,9 +2311,8 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; @@ -2746,9 +2745,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); diff --git a/kernel/exit.c 
b/kernel/exit.c index 3485e5fc499e..c2e6c7b7779f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> #include <uapi/linux/wait.h> @@ -122,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void) late_initcall(kernel_exit_sysfs_init); #endif -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); + detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); @@ -141,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); @@ -174,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -197,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -239,22 +237,27 @@ void __weak release_thread(struct task_struct *dead_task) void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. 
But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); + pidfs_exit(p); cgroup_release(p); + thread_pid = get_pid(p->thread_pid); + write_lock_irq(&tasklist_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -278,7 +281,20 @@ repeat: write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + put_task_struct_rcu_user(p); p = leader; @@ -741,10 +757,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = EXIT_ZOMBIE; /* - * sub-thread or delay_group_leader(), wake up the - * PIDFD_THREAD waiters. + * Ignore thread-group leaders that exited before all + * subthreads did. */ - if (!thread_group_empty(tsk)) + if (!delay_group_leader(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { diff --git a/kernel/fork.c b/kernel/fork.c index 735405a9c5f3..a61a4407ebdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1891,8 +1891,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -2032,25 +2031,18 @@ static inline void rcu_copy_process(struct task_struct *p) */ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - int pidfd; struct file *pidfd_file; - pidfd = get_unused_fd_flags(O_CLOEXEC); + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) { - put_unused_fd(pidfd); + if (IS_ERR(pidfd_file)) return PTR_ERR(pidfd_file); - } - /* - * anon_inode_getfile() ignores everything outside of the - * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. - */ - pidfd_file->f_flags |= (flags & PIDFD_THREAD); + *ret = pidfd_file; - return pidfd; + return take_fd(pidfd); } /** @@ -2432,8 +2424,11 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, flags, &pidfile); + /* + * Note that no task has been attached to @pid yet indicate + * that via CLONE_PIDFD. 
+ */ + retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 3db8567f5a44..cca15859a50b 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -50,10 +50,10 @@ */ static struct { struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) /* @@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[hash & (futex_hashsize - 1)]; + return &futex_queues[hash & futex_hashmask]; } @@ -1127,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk) static int __init futex_init(void) { + unsigned long hashsize, i; unsigned int futex_shift; - unsigned long i; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, + hashsize, 0, 0, &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + hashsize, hashsize); + hashsize = 1UL << futex_shift; - for (i = 0; i < futex_hashsize; i++) { + for (i = 0; i < hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } + futex_hashmask = hashsize - 1; return 0; } core_initcall(futex_init); diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c..75e61c1c6bc0 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include <linux/ioremap.h> #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c901436ebd9f..0ff987d3a799 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - 
irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a979523640d0..b0290849c395 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 287830739783..4258cd6bd3b4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? 
per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980..2861f89880af 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f300bb6be3bd..753eef8e041c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); @@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq) * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -2789,8 +2791,7 @@ out: irq_put_desc_unlock(desc, flags); } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f..147cabb4c077 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. 
+ */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56..ad4a85e31160 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/xarray.h> @@ -269,16 +270,11 @@ fail: return ret; } -void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) -{ - *msg = entry->msg; -} - void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { struct msi_desc *entry = irq_get_msi_desc(irq); - __get_cached_msi_msg(entry, msg); + *msg = entry->msg; } EXPORT_SYMBOL_GPL(get_cached_msi_msg); @@ -342,26 +338,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,7 +447,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -461,7 +460,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -470,16 +469,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. 
*/ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -756,12 +751,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irq_data_get_msi_desc(irqd); + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -979,9 +992,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -989,7 +1001,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -1012,41 +1025,36 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; + /* @bundle and @fwnode_alloced are now in use. 
Prevent cleanup */ + retain_ptr(bundle); + retain_ptr(fwnode_alloced); domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); return true; - -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; } /** @@ -1060,12 +1068,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; @@ -1074,9 +1080,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1092,16 +1095,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1333,21 +1334,17 @@ static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) } /** - * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated * @domid: Id of the interrupt domain to operate on * @first: First index to allocate (inclusive) * @last: Last index to allocate (inclusive) * - * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() - * pair. Use this for MSI irqdomains which implement their own descriptor - * allocation/free. - * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1356,29 +1353,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, .nirqs = last + 1 - first, }; + guard(msi_descs_lock)(dev); return msi_domain_alloc_locked(dev, &ctrl); } - -/** - * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @domid: Id of the interrupt domain to operate on - * @first: First index to allocate (inclusive) - * @last: Last index to allocate (inclusive) - * - * Return: %0 on success or an error code. 
- */ -int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) -{ - int ret; - - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; -} EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** @@ -1481,12 +1458,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1523,13 +1496,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? map.virq : map.index; } @@ -1599,8 +1570,8 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) * @first: First index to free (inclusive) * @last: Last index to free (inclusive) */ -void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1622,9 +1593,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1654,9 +1624,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1675,12 +1644,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a..4198f30aac3c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if 
(kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 2c596851f8a9..7c1a65bd5f8d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); - if (!task1 || !task2) + if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..9ef9850aeebe 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -67,3 +67,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f0226..b15757e63626 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include <linux/lockdep.h> #include <linux/context_tracking.h> #include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" #include <trace/events/lock.h> @@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -5091,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == 
&__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5824,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1..19b636f60a24 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbb..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct 
rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/module/main.c b/kernel/module/main.c index 1fb9ad289a6f..a256cc919ad7 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1221,18 +1221,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -void *__module_writable_address(struct module *mod, void *loc) -{ - for_class_mod_mem_type(type, text) { - struct module_memory *mem = &mod->mem[type]; - - if (loc >= mem->base && loc < mem->base + mem->size) - return loc + (mem->rw_copy - mem->base); - } - - return loc; -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1250,21 +1238,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; - mod->mem[type].base = ptr; - if (execmem_is_rox(execmem_type)) { - ptr = vzalloc(size); + int err = execmem_make_temp_rw(ptr, size); - if (!ptr) { - execmem_free(mod->mem[type].base); + if (err) { + execmem_free(ptr); return -ENOMEM; } - mod->mem[type].rw_copy = ptr; mod->mem[type].is_rox = true; - } else { - mod->mem[type].rw_copy = mod->mem[type].base; - memset(mod->mem[type].base, 0, size); } /* @@ -1278,18 +1260,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) * *do* eventually get freed, but let's just keep things simple * and avoid *any* false positives. 
*/ - kmemleak_not_leak(ptr); + if (!mod->mem[type].is_rox) + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; return 0; } +static void module_memory_restore_rox(struct module *mod) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) + execmem_restore_rox(mem->base, mem->size); + } +} + static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - if (mem->is_rox) - vfree(mem->rw_copy); - execmem_free(mem->base); } @@ -2642,7 +2635,6 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; - mod->mem[type].rw_copy = NULL; continue; } @@ -2659,7 +2651,6 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; const char *sname; - unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; @@ -2680,14 +2671,12 @@ static int move_module(struct module *mod, struct load_info *info) ret = PTR_ERR(dest); goto out_err; } - addr = (unsigned long)dest; codetag_section_found = true; } else { enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + dest = mod->mem[type].base + offset; } if (shdr->sh_type != SHT_NOBITS) { @@ -2710,13 +2699,14 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = addr; + shdr->sh_addr = (unsigned long)dest; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; out_err: + module_memory_restore_rox(mod); for (t--; t >= 0; t--) module_memory_free(mod, t); if (codetag_section_found) @@ -2863,17 +2853,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } -int __weak module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - static int post_relocation(struct module *mod, const struct load_info *info) { - int ret; - /* Sort exception table now relocations are done. */ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2885,24 +2866,7 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - ret = module_finalize(info->hdr, info->sechdrs, mod); - if (ret) - return ret; - - for_each_mod_mem_type(type) { - struct module_memory *mem = &mod->mem[type]; - - if (mem->is_rox) { - if (!execmem_update_copy(mem->base, mem->rw_copy, - mem->size)) - return -ENOMEM; - - vfree(mem->rw_copy); - mem->rw_copy = NULL; - } - } - - return module_post_finalize(info->hdr, info->sechdrs, mod); + return module_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. 
*/ @@ -3499,6 +3463,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->mem[type].size); } + module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: /* diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 74834ba15615..03f4142cfbf4 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include "internal.h" static int module_set_memory(const struct module *mod, enum mod_mem_type type, @@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type, int module_enable_text_rox(const struct module *mod) { for_class_mod_mem_type(type, text) { + const struct module_memory *mem = &mod->mem[type]; int ret; - if (mod->mem[type].is_rox) - continue; - - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + if (mem->is_rox) + ret = execmem_restore_rox(mem->base, mem->size); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else ret = module_set_memory(mod, type, set_memory_x); diff --git a/kernel/padata.c b/kernel/padata.c index 418987056340..b3d4eacc4f5d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -290,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); } spin_unlock(&reorder->lock); diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8..4ac2ce46817f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); @@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { - /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; - unsigned long flags; - spin_lock_irqsave(&pidmap_lock, flags); + lockdep_assert_not_held(&tasklist_lock); + + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -155,11 +141,23 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } +void free_pids(struct pid **pids) +{ + int tmp; + + /* + * This can batch pidmap_lock. 
+ */ + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (pids[tmp]) + free_pid(pids[tmp]); +} + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { @@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -302,7 +300,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -310,9 +308,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) @@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) +static void __change_pid(struct pid **pids, struct task_struct *task, + enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); @@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type, if (pid_has_task(pid, tmp)) return; - free_pid(pid); + WARN_ON(pids[type]); + pids[type] = pid; } -void detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type) { - __change_pid(task, type, NULL); + __change_pid(pids, task, type, NULL); } -void change_pid(struct task_struct *task, enum pid_type type, +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type, struct pid *pid) { - __change_pid(task, type, pid); + __change_pid(pids, task, type, pid); attach_pid(task, type); } @@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct 
task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -564,15 +572,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { - unsigned int f_flags; + unsigned int f_flags = 0; struct pid *pid; struct task_struct *task; + enum pid_type type; - pid = pidfd_get_pid(pidfd, &f_flags); - if (IS_ERR(pid)) - return ERR_CAST(pid); + switch (pidfd) { + case PIDFD_SELF_THREAD: + type = PIDTYPE_PID; + pid = get_task_pid(current, type); + break; + case PIDFD_SELF_THREAD_GROUP: + type = PIDTYPE_TGID; + pid = get_task_pid(current, type); + break; + default: + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + type = PIDTYPE_TGID; + break; + } - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, type); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 07668433644b..057db78876cd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2461,7 +2461,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); -static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2474,7 +2473,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -4466,7 +4464,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..aa42de4d2768 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -65,6 +65,17 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. 
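For context, the srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe() readers that this new FORCE_NEED_SRCU_NMI_SAFE option exercises are used like ordinary SRCU readers but are safe to call from NMI handlers. The following is a minimal C sketch, not part of the diff, showing the intended usage; the demo_srcu srcu_struct, the demo_state structure, and demo_read_from_nmi() are hypothetical names introduced only for illustration.

#include <linux/srcu.h>

struct demo_state {
	int value;
};

DEFINE_STATIC_SRCU(demo_srcu);			/* hypothetical srcu_struct */
static struct demo_state __rcu *demo_ptr;	/* hypothetical protected pointer */

/* Reader that may run in NMI context, hence the _nmisafe flavor. */
static int demo_read_from_nmi(void)
{
	struct demo_state *p;
	int idx, val = 0;

	idx = srcu_read_lock_nmisafe(&demo_srcu);
	p = srcu_dereference(demo_ptr, &demo_srcu);
	if (p)
		val = p->value;
	srcu_read_unlock_nmisafe(&demo_srcu, idx);
	return val;
}

Note that a given srcu_struct should not mix reader flavors: once one reader uses the NMI-safe variant, the other readers of that srcu_struct are expected to do likewise, and kernels built with PROVE_RCU warn when flavors are mixed, as the srcu_readers_lock_idx() hunks later in this diff illustrate.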
+ config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -91,7 +102,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -323,21 +334,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. 
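The rewritten RCU_LAZY help text above describes multi-second deferral of grace periods for call_rcu() callbacks on rcu_nocbs (offloaded) CPUs, with call_rcu_hurry() and the rcutree.enable_rcu_lazy=0 boot parameter as the ways to avoid that delay; the call_rcu() kernel-doc additions to kernel/rcu/tree.c later in this diff make the same point. A minimal C sketch of the resulting API choice follows; it is not part of the diff, and the demo_node structure and demo_* functions are hypothetical.

#include <linux/container_of.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
	struct rcu_head rh;
	int payload;
};

static void demo_free_cb(struct rcu_head *rh)
{
	kfree(container_of(rh, struct demo_node, rh));
}

/* Routine retirement: lazy batching and a deferred grace period are fine. */
static void demo_retire(struct demo_node *p)
{
	call_rcu(&p->rh, demo_free_cb);
}

/* Latency-sensitive retirement: ask for the grace period to start promptly. */
static void demo_retire_hurry(struct demo_node *p)
{
	call_rcu_hurry(&p->rh, demo_free_cb);
}

The trade-off is energy efficiency versus latency: routine memory reclamation tolerates the extra delay, while paths that need the callback to run soon should use call_rcu_hurry() or boot with laziness disabled.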
+ config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..eed2951a4962 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU @@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..65095664f5c5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -147,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -272,6 +274,9 @@ struct rt_read_seg { bool 
rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -406,6 +411,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -610,6 +617,8 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -655,6 +664,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -677,8 +688,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); @@ -694,6 +708,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +739,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) @@ -791,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -1148,8 +1172,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + 
VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1728,7 +1764,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1873,6 +1909,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, #define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) { + int mask; + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) return; @@ -1899,11 +1937,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->extendables && - !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && (preempt_count() & PREEMPT_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->readlock_nesting && - !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1965,6 +2019,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. 
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -2512,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2522,11 +2583,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, @@ -3553,6 +3614,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3597,7 +3659,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3635,7 +3697,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); @@ -3648,6 +3714,27 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? 
"" : "+", err_segs[i].rt_delay_ms); @@ -3994,6 +4081,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4145,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4156,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc4..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..6e9fe2ce1075 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { 
preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0..d2a694944553 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. 
*/ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. 
*/ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. 
Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! 
*/ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). 
This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); @@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. 
* * Can block; must be called from process context. * @@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", @@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..466668eb4fad 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -2256,7 +2256,7 @@ void __init 
tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..c1ebfd51768b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) - kasan_record_aux_stack(ptr); + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, ptr); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..659f83e71048 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. 
*/ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on @@ -1254,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1612,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); /* Finally. */ complete(&rs->completion); @@ -1801,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). 
*/ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -2931,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } @@ -3107,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3138,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -3166,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3214,7 +3244,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. */ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate); rcu_sr_normal_add_req(&rs); @@ -3357,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. 
*/ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..5ff3bc56ff51 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? 
task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e..3c0bbbbb686f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs diff --git a/kernel/reboot.c b/kernel/reboot.c index b5a8569e5d81..41ab9e1ba357 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -704,6 +704,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } diff --git a/kernel/rseq.c b/kernel/rseq.c index 2cb16091ec0a..b7a1ec327e81 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,24 +78,24 @@ efault: return -EFAULT; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ - rseq_kernel_fields(t)->cpu_id_start = cpu_id; - rseq_kernel_fields(t)->cpu_id = cpu_id; - rseq_kernel_fields(t)->node_id = node_id; - rseq_kernel_fields(t)->mm_cid = mm_cid; -} +/* + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent + * state. 
+ */ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ + } while (0) + #else static int rseq_validate_ro_fields(struct task_struct *t) { return 0; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ -} +#define rseq_unsafe_put_user(t, value, field, error_label) \ + unsafe_put_user(value, &t->rseq->field, error_label) #endif /* @@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t) WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); - rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -195,6 +196,7 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { + struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) - return -EFAULT; - /* - * Reset cpu_id_start to its initial state (0). - */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; - /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. - */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; - /* - * Reset node_id to its initial state (0). - */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + goto efault; + + if (!user_write_access_begin(rseq, t->rseq_len)) + goto efault; + /* - * Reset mm_cid to its initial state (0). + * Reset all fields to their initial state. + * + * All fields have an initial state of 0 except cpu_id which is set to + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after + * unregistration can figure out that rseq needs to be registered + * again. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; - - rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ + user_write_access_end(); + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} + +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. 
+ */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + return 0; } +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -507,6 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. 
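For context on why a stale rseq_cs can exist at registration time, below is a minimal user-space registration sketch. It is illustrative only: my_rseq, my_rseq_register() and MY_RSEQ_SIG are made-up names, and on a libc that already registers rseq for its threads this raw syscall would simply fail with EBUSY.

	#include <linux/rseq.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>

	#define MY_RSEQ_SIG 0x53053053	/* arbitrary abort signature */

	/* The kernel requires the rseq area to be 32-byte aligned. */
	static __thread struct rseq my_rseq __attribute__((aligned(4 * sizeof(__u64))));

	static int my_rseq_register(void)
	{
		/* Zero the area so no stale rseq_cs pointer is carried over. */
		memset(&my_rseq, 0, sizeof(my_rseq));

		/* 32 is the original ABI size, accepted by every kernel with rseq. */
		return syscall(__NR_rseq, &my_rseq, 32, 0, MY_RSEQ_SIG);
	}

A libc that recycles such an area for a new thread without zeroing it is exactly the case the registration path above now tolerates by clearing rseq_cs instead of rejecting the call.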
+ */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd45..8ae86371ddcd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING +endif # # Build efficiency: # diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 80a3df49ab47..bf9d8db94b70 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -68,9 +68,7 @@ # include "cpufreq_schedutil.c" #endif -#ifdef CONFIG_SCHED_DEBUG -# include "debug.c" -#endif +#include "debug.c" #ifdef CONFIG_SCHEDSTATS # include "stats.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 67189907214d..87540217fc09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
*/ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -800,11 +797,10 @@ void update_rq_clock(struct rq *rq) if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif + clock = sched_clock_cpu(cpu_of(rq)); scx_rq_clock_update(rq, clock); @@ -916,8 +912,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1720,7 +1715,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1740,7 +1735,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); @@ -1757,7 +1752,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1784,7 +1779,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1942,12 +1937,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -2122,7 +2117,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -2727,7 +2722,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. 
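The uclamp hunks above replace open-coded static-branch operations with uclamp_is_used() and sched_uclamp_enable(). As a rough sketch of the intent only (the real helpers live in kernel/sched/sched.h and may differ in detail), they are assumed to be thin wrappers around the same sched_uclamp_used key:

	/* Assumed shape of the helpers, for illustration only. */
	static inline bool uclamp_is_used(void)
	{
		return static_branch_likely(&sched_uclamp_used);
	}

	static inline void sched_uclamp_enable(void)
	{
		static_branch_enable(&sched_uclamp_used);
	}

Centralizing the key behind helpers keeps the call sites readable and makes any future change to the underlying mechanism a one-place edit.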
*/ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -3292,7 +3287,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3330,7 +3324,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3922,13 +3915,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* @@ -4196,7 +4184,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4490,7 +4478,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -5578,7 +5566,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5623,9 +5610,6 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. @@ -5746,7 +5730,7 @@ static void sched_tick_remote(struct work_struct *work) * we are always sure that there is no proxy (only a * single task is running). */ - SCHED_WARN_ON(rq->curr != rq->donor); + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5966,7 +5950,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CT_STATE_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6719,9 +6703,7 @@ static void __sched notrace __schedule(int sched_mode) picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif if (likely(prev != next)) { rq->nr_switches++; @@ -6812,7 +6794,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. 
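ttwu_queue_cond() above now defers to scx_allow_ttwu_queue() instead of unconditionally skipping the queued-wakeup path for sched_ext tasks. A plausible shape of that helper, shown purely as an assumption tied to the SCX_OPS_ALLOW_QUEUED_WAKEUP flag and the scx_ops_allow_queued_wakeup static key introduced further down, is:

	static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
	{
		/*
		 * Non-SCX tasks keep the ttwu_queue optimization unconditionally;
		 * SCX tasks get it only when the loaded scheduler opted in.
		 */
		return !task_on_scx(p) ||
		       static_branch_likely(&scx_ops_allow_queued_wakeup);
	}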
*/ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -7095,7 +7077,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7290,7 +7272,7 @@ int __sched __cond_resched(void) return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7299,6 +7281,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -7647,10 +7631,57 @@ PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ +#define preempt_dynamic_mode -1 + static inline void preempt_dynamic_init(void) { } #endif /* CONFIG_PREEMPT_DYNAMIC */ +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; + +const char *preempt_model_str(void) +{ + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; + + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); + + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; + + return "NONE"; +} + int io_schedule_prepare(void) { int old_iowait = current->in_iowait; @@ -7765,10 +7796,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -8183,7 +8213,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -8202,7 +8232,7 @@ static void cpuset_cpu_inactive(unsigned int cpu) cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } } @@ -8424,9 +8454,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. 
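The new preempt_model_str() above assembles a human-readable description of the preemption model from the build-time and dynamic configuration. A caller would use it roughly as sketched here; the wrapper function is illustrative and not part of this patch.

	/* Illustrative use only: */
	static void report_preempt_model(void)
	{
		pr_info("Preemption model: %s\n", preempt_model_str());
	}

Depending on configuration, the returned string reads, for example, "NONE", "VOLUNTARY", "PREEMPT(full)", "PREEMPT_RT", or "PREEMPT_{RT,(full)}" when both PREEMPT_RT and dynamic preemption are built in.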
*/ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -9016,7 +9046,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -9028,13 +9058,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -9055,20 +9079,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current_donor(rq, tsk); @@ -9079,7 +9094,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); + sched_change_group(tsk); if (!for_autogroup) scx_cgroup_move_task(tsk); @@ -9203,7 +9218,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -9295,7 +9310,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -10538,7 +10553,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1ef98a93eb1d..c4606ca89210 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * a cookie until after we've removed it, we must have core scheduling * enabled here. 
*/ - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ff4df16b5186..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ @@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -2956,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2981,18 +2979,22 @@ void dl_clear_root_domain(struct root_domain *rd) rd->dl_bw.total_bw = 0; /* - * dl_server bandwidth is only restored when CPUs are attached to root - * domains (after domains are created or CPUs moved back to the - * default root doamin). + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. */ for_each_cpu(i, rd->span) { struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; if (dl_server(dl_se) && cpu_active(i)) - rd->dl_bw.total_bw += dl_se->dl_bw; + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); } } +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -3171,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. 
+ */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -3192,7 +3197,7 @@ int sched_dl_global_validate(void) for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -3229,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -3240,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3567,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ef047add7f9e..56ae54e0ce6a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - static const char * preempt_modes[] = { - "none", "voluntary", "full", "lazy", - }; - int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int j; + + /* Count entries in NULL terminated preempt_modes */ + for (j = 0; preempt_modes[j]; j++) + ; + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { if (preempt_dynamic_mode == i) @@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, bool orig; cpus_read_lock(); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); orig = sched_debug_verbose; result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); @@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, sd_dentry = NULL; } - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); cpus_read_unlock(); return result; @@ -515,9 +517,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7b9dfee858e7..21575d39c376 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -93,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. 
*/ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -101,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -114,13 +117,13 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, /* * A migration disabled task can only execute on its current CPU. By @@ -133,7 +136,29 @@ enum scx_ops_flags { * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr * and thus may disagree with cpumask_weight(p->cpus_ptr). */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* * CPU cgroup support flags @@ -144,7 +169,9 @@ enum scx_ops_flags { SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -779,6 +806,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { @@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * The total number of tasks enqueued (or pick_task-ed) with a + * default time slice (SCX_SLICE_DFL). + */ + s64 SCX_EV_ENQ_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +/* + * The event counter is organized by a per-CPU variable to minimize the + * accounting overhead without synchronization. A system-wide view on the + * event counter is constructed when requested by scx_bpf_get_event_stat(). + */ +static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); + +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(name, cnt) do { \ + this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. 
+ */ +#define __scx_add_event(name, cnt) do { \ + __this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ +} while (0) + + +static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); + static enum scx_ops_enable_state scx_ops_enable_state(void) { return atomic_read(&scx_ops_enable_state_var); @@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (scx_rq_bypassing(rq)) { + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); goto global; + } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && - unlikely(p->flags & PF_EXITING)) + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); goto local; + } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && - is_migration_disabled(p)) + is_migration_disabled(p)) { + __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; + } if (!SCX_HAS_OP(enqueue)) goto global; @@ -2072,6 +2197,7 @@ local: */ touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; @@ -2079,6 +2205,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } static void ops_dequeue(struct task_struct *p, u64 deq_flags) @@ -2337,11 +2468,11 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, - bool trigger_error) + bool enforce) { int cpu = cpu_of(rq); - SCHED_WARN_ON(task_cpu(p) == cpu); + WARN_ON_ONCE(task_cpu(p) == cpu); /* * If @p has migration disabled, @p->cpus_ptr is updated to contain only @@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * easily be masked if task_allowed_on_cpu() is done first. 
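scx_add_event()/__scx_add_event() above bump per-CPU counters without any locking, and a consolidated view is only built on demand. The following is a minimal sketch of such an aggregation using the scx_agg_event() helper; scx_events_snapshot() is a hypothetical name, and the in-tree scx_bpf_events() may be structured differently.

	static void scx_events_snapshot(struct scx_event_stats *events)
	{
		struct scx_event_stats *e_cpu;
		int cpu;

		memset(events, 0, sizeof(*events));
		for_each_possible_cpu(cpu) {
			e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
			scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
			scx_agg_event(events, e_cpu, SCX_EV_ENQ_SLICE_DFL);
			/* ... repeat for the remaining SCX_EV_* counters ... */
		}
	}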
*/ if (unlikely(is_migration_disabled(p))) { - if (trigger_error) + if (enforce) scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", p->comm, p->pid, task_cpu(p), cpu); return false; @@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) + if (enforce) scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", cpu, p->comm, p->pid); return false; } - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2893,6 +3027,7 @@ no_tasks: if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) + if (!p->scx.slice) { p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } } else { p = first_local_task(rq); if (!p) { @@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } } @@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - * - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to - * reduce memory writes, which may help alleviate cache - * coherence pressure. 
- */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC - * domain is not defined). - */ -static unsigned int llc_weight(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sd->span_weight; -} - -/* - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC - * domain is not defined). - */ -static struct cpumask *llc_span(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sched_domain_span(sd); -} - -/* - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the - * NUMA domain is not defined). - */ -static unsigned int numa_weight(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return 0; - sg = sd->groups; - if (!sg) - return 0; - - return sg->group_weight; -} - -/* - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA - * domain is not defined). - */ -static struct cpumask *numa_span(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return NULL; - sg = sd->groups; - if (!sg) - return NULL; - - return sched_group_span(sg); -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. - * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) - if (llc_weight(cpu) != numa_weight(cpu)) - return true; - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. 
- */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - unsigned int nr_cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - nr_cpus = llc_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus()) - enable_llc = true; - pr_debug("sched_ext: LLC=%*pb weight=%u\n", - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); - } - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - str_enabled_disabled(enable_llc)); - pr_debug("sched_ext: NUMA idle selection %s\n", - str_enabled_disabled(enable_numa)); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * 5. Pick any idle CPU usable by the task. - * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. 
- * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. - */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. 
- */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, p, prev_cpu, wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { - bool found; s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } else { + cpu = prev_cpu; } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, (struct cpumask *)p->cpus_ptr); } -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - -static void update_builtin_idle(int cpu, bool idle) -{ - assign_cpu(cpu, idle_masks.cpu, idle); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - if (!cpumask_subset(smt, idle_masks.cpu)) - return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. - */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) -{ - int cpu = cpu_of(rq); - - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. 
- * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. - */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) @@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj, } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_event_stats events; + int at = 0; + + scx_bpf_events(&events, sizeof(events)); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); @@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void) static void scx_ops_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + int cpu; unsigned long flags; @@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; + 
scx_add_event(SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_ops_breather_depth); @@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_allow_queued_wakeup); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { @@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_bpf_events(&events, sizeof(events)); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + return 0; } @@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_ops_enable_mutex); + /* + * Clear event counters so a new scx scheduler gets + * fresh event counter values. 
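The counters accumulated in these paths are also exported to userspace through the "events" attribute registered in scx_sched_attrs earlier in this hunk, one "NAME value" line per counter. A minimal reader sketch, assuming the attribute lands next to the existing "ops" file under /sys/kernel/sched_ext/root/ (the path is an assumption; this patch only adds the attribute itself):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/sched_ext/root/events", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "SCX_EV_BYPASS_ACTIVATE 3" */
	fclose(f);
	return 0;
}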
+ */ + for_each_possible_cpu(cpu) { + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); + memset(e, 0, sizeof(*e)); + } + if (!scx_ops_helper) { WRITE_ONCE(scx_ops_helper, scx_create_rt_helper("sched_ext_ops_helper")); @@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif + scx_idle_update_selcpu_topology(ops); + cpus_read_unlock(); ret = validate_ops(ops); @@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable(&scx_has_op[i]); + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + static_branch_enable(&scx_ops_allow_queued_wakeup); if (ops->flags & SCX_OPS_ENQ_LAST) static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) @@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + scx_idle_enable(ops); /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); @@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
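For reference, the per-CPU counters cleared in the loop above are the same event_stats_cpu slots that scx_add_event()/__scx_add_event() bump on the hot paths; their definitions are not part of this hunk. A plausible shape, assuming the unlocked variant simply updates the current CPU's slot:

/* Sketch only, not the definition from this patch. */
#define __scx_add_event(name, cnt) do {					\
	this_cpu_add(event_stats_cpu.name, (cnt));			\
} while (0)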
*/ -#include <linux/btf_ids.h> - -__bpf_kfunc_start_defs(); - -static bool check_builtin_idle_enabled(void) -{ - if (static_branch_likely(&scx_builtin_idle_enabled)) - return true; - - scx_ops_error("built-in idle tracking is disabled"); - return false; -} - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!ops_cpu_valid(prev_cpu, NULL)) - goto prev_cpu; - - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. */ - if (p->migration_pending) + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) continue; dispatch_dequeue(rq, p); @@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } /** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) } /** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 
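scx_bpf_nr_node_ids() gives BPF schedulers the same sizing guarantee for nodes that scx_bpf_nr_cpu_ids() gives for CPUs. A minimal BPF-side sketch that creates one user DSQ per possible node; the callback name and the dsq-id scheme are illustrative, not taken from this patch:

s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
{
	u32 node, nr_nodes = scx_bpf_nr_node_ids();
	s32 err;

	bpf_for(node, 0, nr_nodes) {
		/* Use the node id itself as the user DSQ id. */
		err = scx_bpf_create_dsq(node, node);
		if (err)
			return err;
	}
	return 0;
}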
- */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - * @idle_mask: &cpumask to use - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!check_builtin_idle_enabled()) - return false; - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!check_builtin_idle_enabled()) - return -EBUSY; - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - -/** * scx_bpf_task_running - Is task currently running? 
* @p: task of interest */ @@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_event_stats e_sys, *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into the system-wide counters. */ + memset(&e_sys, 0, sizeof(e_sys)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) @@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7820,8 +7387,6 @@ static int __init scx_init(void) * check using scx_kf_allowed(). 
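From the BPF side the same aggregation can be requested directly with the new kfunc; the struct members carry the event names verbatim, so a monitoring helper might look like the sketch below (the helper name and the chosen counters are illustrative):

static void report_event_stats(void)
{
	struct scx_event_stats events;

	scx_bpf_events(&events, sizeof(events));
	bpf_printk("select_cpu fallbacks: %llu",
		   events.SCX_EV_SELECT_CPU_FALLBACK);
	bpf_printk("bypass activations:   %llu",
		   events.SCX_EV_BYPASS_ACTIVATE);
}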
*/ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7841,6 +7406,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1079b56b0f7a..1bda96b19a1b 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,8 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + static_branch_likely(&scx_ops_allow_queued_wakeup) || + p->sched_class != &ext_sched_class; +} + #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..52c36a70a3d0 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Return the idle masks associated to a target @node. 
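A scheduler opts into the per-node cpumasks introduced here via an ops flag at load time. A minimal sketch of the BPF-side struct_ops definition, with scheduler and callback names made up for illustration:

SEC(".struct_ops.link")
struct sched_ext_ops numa_idle_ops = {
	.select_cpu	= (void *)numa_select_cpu,
	.enqueue	= (void *)numa_enqueue,
	.init		= (void *)numa_init,
	.flags		= SCX_OPS_BUILTIN_IDLE_PER_NODE,
	.name		= "numa_idle",
};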
+ * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. 
+ */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. 
+ * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. 
+ * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + int node = scx_cpu_node_if_enabled(prev_cpu); + s32 cpu; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + cpu = smp_processor_id(); + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. 
+ */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = prev_cpu; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); + if (cpu >= 0) + goto out_unlock; + +out_unlock: + rcu_read_unlock(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. + */ +void scx_idle_init_masks(void) +{ + int node; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(node) { + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, node); + BUG_ON(!scx_idle_node_masks[node]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. 
+ */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable(&scx_builtin_idle_enabled); + else + static_branch_disable(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable(&scx_builtin_idle_per_node); + else + static_branch_disable(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. 
+ */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_ops_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_ops_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!ops_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ +#ifdef CONFIG_SMP + s32 cpu; +#endif + if (!ops_cpu_valid(prev_cpu, NULL)) + goto prev_cpu; + + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. 
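The calling convention of scx_bpf_select_cpu_dfl() is unchanged by the move into ext_idle.c; the usual idiom from ops.select_cpu() is to direct-dispatch when an idle CPU was claimed. A minimal sketch, with the op name chosen for illustration and scx_bpf_dsq_insert() assumed to be the current direct-dispatch kfunc:

s32 BPF_STRUCT_OPS(numa_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	if (is_idle)	/* idle CPU claimed: dispatch straight to its local DSQ */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return cpu;
}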
+ */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. 
+ * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. 
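Taken together, the node-aware kfuncs let a scheduler keep a wakee on its previous node explicitly. A small sketch combining scx_bpf_cpu_node() with the two node-aware pickers; the helper is illustrative, not part of this patch:

static s32 pick_cpu_same_node(struct task_struct *p, s32 prev_cpu)
{
	int node = scx_bpf_cpu_node(prev_cpu);
	s32 cpu;

	/* Idle CPU strictly within prev_cpu's node, if any. */
	cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
					 SCX_PICK_IDLE_IN_NODE);
	if (cpu >= 0)
		return cpu;

	/* Otherwise any allowed CPU, preferring idle ones near @node. */
	return scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, 0);
}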
+ */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..511cc2221f7a --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. 
and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c798d2795243..e43993a4e580 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void assert_list_leaf_cfs_rq(struct rq *rq) { - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } /* Iterate through all leaf cfs_rq's on a runqueue */ @@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + +/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes @@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). 
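For scale: with the default SCHED_TUNABLESCALING_LOG policy the normalized slice is multiplied by 1 + ilog2(ncpus), so on an 8-CPU machine that factor is 4 and the effective base slice in the hunk above drops from 0.75 ms * 4 = 3.0 ms to 0.70 ms * 4 = 2.8 ms; smaller systems see a proportionally smaller change.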
- */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -967,7 +983,6 @@ found: return best; } -#ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -994,7 +1009,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -3301,7 +3315,7 @@ static void task_numa_work(struct callback_head *work) bool vma_pids_skipped; bool vma_pids_forced = false; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* @@ -4020,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa) * Make sure that rounding and/or propagation of PELT values never * break this. */ - SCHED_WARN_ON(sa->load_avg || + WARN_ON_ONCE(sa->load_avg || sa->util_avg || sa->runnable_avg); @@ -5444,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { - SCHED_WARN_ON(!se->sched_delayed); + WARN_ON_ONCE(!se->sched_delayed); } else { bool delay = sleep; /* @@ -5454,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_SPECIAL) delay = false; - SCHED_WARN_ON(delay && se->sched_delayed); + WARN_ON_ONCE(delay && se->sched_delayed); if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { @@ -5530,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); - SCHED_WARN_ON(cfs_rq->curr); + WARN_ON_ONCE(cfs_rq->curr); cfs_rq->curr = se; /* @@ -5579,7 +5590,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ - SCHED_WARN_ON(cfs_rq->next->sched_delayed); + WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next; } @@ -5615,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } - SCHED_WARN_ON(cfs_rq->curr != prev); + WARN_ON_ONCE(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5838,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self = 0; - if (SCHED_WARN_ON((s64)delta < 0)) + if (WARN_ON_ONCE((s64)delta < 0)) delta = 0; cfs_rq->throttled_clock_self_time += delta; @@ -5858,7 +5869,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(cfs_rq->throttled_clock_self); if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } @@ -5967,7 +5978,7 @@ done: * throttled-list. rq->lock protects completion. 
*/ cfs_rq->throttled = 1; - SCHED_WARN_ON(cfs_rq->throttled_clock); + WARN_ON_ONCE(cfs_rq->throttled_clock); if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; @@ -6123,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) } /* Already enqueued */ - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return; first = list_empty(&rq->cfsb_csd_list); @@ -6142,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { lockdep_assert_rq_held(rq_of(cfs_rq)); - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || cfs_rq->runtime_remaining <= 0)) return; @@ -6178,7 +6189,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; @@ -6199,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * We currently only expect to be unthrottling * a single cfs_rq locally. */ - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } @@ -6224,7 +6235,7 @@ next: rq_unlock_irqrestore(rq, &rf); } - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); rcu_read_unlock(); @@ -6541,14 +6552,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6776,7 +6787,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - SCHED_WARN_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; @@ -6887,8 +6898,8 @@ requeue_delayed_entity(struct sched_entity *se) * Because a delayed entity is one that is still on * the runqueue competing until elegibility. 
*/ - SCHED_WARN_ON(!se->sched_delayed); - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->sched_delayed); + WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); @@ -6991,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable += h_nr_runnable; @@ -7120,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable -= h_nr_runnable; @@ -7144,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) rq->next_balance = jiffies; if (p && task_delayed) { - SCHED_WARN_ON(!task_sleep); - SCHED_WARN_ON(p->on_rq != 1); + WARN_ON_ONCE(!task_sleep); + WARN_ON_ONCE(p->on_rq != 1); /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); @@ -8723,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) + if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return; @@ -8783,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8803,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. 
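
The set_protect_slice()/protect_slice()/cancel_protect_slice() helpers used in the hunks above only wrap the existing trick of stashing the deadline in the otherwise-unused vlag field between pick and dequeue: RUN_TO_PARITY keeps returning the current entity while the stashed copy still matches, and cancelling simply breaks the match so the next pick may preempt. A stand-alone sketch of that pattern, using a stripped-down stand-in struct rather than the kernel's sched_entity:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the two sched_entity fields the helpers touch. */
struct entity {
	long long vlag;      /* unused between pick and dequeue, so free to borrow */
	long long deadline;  /* virtual deadline of the currently granted slice */
};

/* Arm protection at pick time by stashing the deadline in vlag. */
static void set_protect_slice(struct entity *se)
{
	se->vlag = se->deadline;
}

/* Protection holds as long as the stashed copy still matches. */
static bool protect_slice(const struct entity *se)
{
	return se->vlag == se->deadline;
}

/* Break the match so the next pick is free to preempt this entity. */
static void cancel_protect_slice(struct entity *se)
{
	if (protect_slice(se))
		se->vlag = se->deadline + 1;
}

int main(void)
{
	struct entity se = { .vlag = 0, .deadline = 100 };

	set_protect_slice(&se);
	printf("protected after pick:   %d\n", protect_slice(&se));  /* 1 */
	cancel_protect_slice(&se);
	printf("protected after cancel: %d\n", protect_slice(&se));  /* 0 */
	return 0;
}
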
@@ -9417,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; @@ -12461,7 +12482,7 @@ unlock: void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); + WARN_ON_ONCE(rq != this_rq()); if (likely(!rq->nohz_tick_stopped)) return; @@ -12497,7 +12518,7 @@ void nohz_balance_enter_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - SCHED_WARN_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) @@ -12580,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) int balance_cpu; struct rq *rq; - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* * We assume there will be no idle load after this update and clear @@ -13020,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, struct cfs_rq *cfs_rqb; s64 delta; - SCHED_WARN_ON(task_rq(b)->core != rq->core); + WARN_ON_ONCE(task_rq(b)->core != rq->core); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -13223,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); @@ -13258,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs if (!first) return; - SCHED_WARN_ON(se->sched_delayed); + WARN_ON_ONCE(se->sched_delayed); if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); @@ -13645,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = { #endif }; -#ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq, *pos; @@ -13679,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b1..fa03ec3ed56a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -169,9 +168,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } @@ -1713,7 +1711,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - if (SCHED_WARN_ON(list_empty(queue))) + if (WARN_ON_ONCE(list_empty(queue))) return 
NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); @@ -2910,6 +2908,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff int ret; mutex_lock(&mutex); + sched_domains_mutex_lock(); old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; @@ -2936,6 +2935,7 @@ undo: sysctl_sched_rt_period = old_period; sysctl_sched_rt_runtime = old_runtime; } + sched_domains_mutex_unlock(); mutex_unlock(&mutex); return ret; @@ -2967,7 +2967,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff } #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; @@ -2978,4 +2977,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 023b844159c9..47972f34ea70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -91,12 +91,6 @@ struct cpuidle_state; #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -998,7 +992,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -1180,10 +1174,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1571,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { } static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1652,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1699,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1712,14 +1704,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). 
*/ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; @@ -1770,21 +1760,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -# ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -# endif +#ifdef CONFIG_SMP + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1793,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } extern @@ -2072,9 +2057,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -2114,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) { } -static inline void dirty_sched_domain_sysctl(int cpu) { } -#endif extern int sched_update_scaling(void); @@ -2200,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2218,13 +2191,11 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL @@ -2246,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG: */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. 
- */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* !SCHED_DEBUG */ - extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2685,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } @@ -2843,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); # define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2859,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2932,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2947,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } -#endif #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) 
\ __DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ @@ -3162,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -3173,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -# ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); -# endif /* CONFIG_NUMA_BALANCING */ -#else /* !CONFIG_SCHED_DEBUG: */ -static inline void resched_latency_warn(int cpu, u64 latency) { } -#endif /* !CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -3394,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; } unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3417,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3426,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. 
- */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} - #define for_each_clamp_id(clamp_id) \ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) @@ -3486,6 +3441,8 @@ static inline bool uclamp_is_used(void) return false; } +static inline void sched_uclamp_enable(void) {} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3619,6 +3576,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 19cdbe96f93d..452826df6ae1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags) if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); + WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 456d339be98f..c326de1344fb 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p, * blocking operation which obviously cannot be done while holding * scheduler locks. */ - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); return 0; } @@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; @@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, struct sched_attr attr; int retval; - if (!uattr || pid < 0 || flags) + if (unlikely(!uattr || pid < 0 || flags)) return -EINVAL; retval = sched_copy_attr(uattr, &attr); @@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) struct task_struct *p; int retval; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; scoped_guard (rcu) { @@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, struct task_struct *p; int retval; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) return -EINVAL; scoped_guard (rcu) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c49aea8c1025..f1ebc60d967f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} 
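
The sched_uclamp_enable() helper introduced above, and used in place of the open-coded static_branch_enable() at the call site, is a small instance of a one-way lazy enable: test the cheap uclamp_is_used() check first and pay for the expensive enable (which takes cpus_read_lock()) only once, since the key is never cleared again. A user-space sketch of the same pattern, with an atomic flag standing in for the static key; the kernel names are reused here purely for readability:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stands in for the sched_uclamp_used static key; it is never cleared. */
static atomic_bool uclamp_used;

static bool uclamp_is_used(void)
{
	return atomic_load_explicit(&uclamp_used, memory_order_acquire);
}

/* Stands in for static_branch_enable(), the expensive one-time path. */
static void expensive_enable(void)
{
	puts("expensive enable path taken");
	atomic_store_explicit(&uclamp_used, true, memory_order_release);
}

/* Cheap check first: only the very first caller pays for the enable. */
static void sched_uclamp_enable(void)
{
	if (!uclamp_is_used())
		expensive_enable();
}

int main(void)
{
	sched_uclamp_enable();  /* prints once */
	sched_uclamp_enable();  /* no-op */
	return 0;
}
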
-#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -560,7 +557,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -2275,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2680,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2712,21 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. - * Tasks contribution will be then recomputed - * in function dl_update_tasks_root_domain(), - * dl_servers contribution in function - * dl_restore_server_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2783,6 +2765,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2791,7 +2774,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7bbb408431eb..41aa761c7738 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,13 +29,11 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> +#include <asm/syscall.h> + /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -#include <asm/syscall.h> -#endif - #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> @@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk) if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; + if (READ_ONCE(tsk->seccomp.filter) == NULL) + return; + spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ @@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags) BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); assert_spin_locked(&current->sighand->siglock); + /* + * Don't touch any of the threads if the process is being killed. + * This allows for a lockless check in seccomp_filter_release. + */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) + return; + /* Synchronize all threads. 
*/ caller = current; for_each_thread(caller, thread) { @@ -1074,6 +1082,13 @@ void secure_computing_strict(int this_syscall) else BUG(); } +int __secure_computing(void) +{ + int this_syscall = syscall_get_nr(current, current_pt_regs()); + + secure_computing_strict(this_syscall); + return 0; +} #else #ifdef CONFIG_SECCOMP_FILTER @@ -1225,13 +1240,12 @@ out: return -1; } -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; - struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -1239,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ smp_rmb(); - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } + populate_seccomp_data(&sd); - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1302,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ - if (__seccomp_filter(this_syscall, NULL, true)) + if (__seccomp_filter(this_syscall, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, sd)) + if (seccomp_do_user_notification(this_syscall, match, &sd)) goto skip; return 0; @@ -1350,8 +1361,7 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { BUG(); @@ -1359,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } #endif -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { int mode = current->seccomp.mode; int this_syscall; @@ -1368,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; - this_syscall = sd ? sd->nr : - syscall_get_nr(current, current_pt_regs()); + this_syscall = syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd, false); + return __seccomp_filter(this_syscall, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); diff --git a/kernel/signal.c b/kernel/signal.c index 875e97f6205a..86ba66d95da5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. 
*/ - if (tmr->it_signal && tmr->it_sig_periodic) + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); @@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the - * non-PIDFD_THREAD waiters. + * Notify for thread-group leaders without subthreads. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); @@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file) (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) +static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, + siginfo_t __user *info, unsigned int flags) +{ + kernel_siginfo_t kinfo; + + switch (flags) { + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + + if (info) { + int ret; + + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + return ret; + + if (unlikely(sig != kinfo.si_signo)) + return -EINVAL; + + /* Only allow sending arbitrary signals to yourself. */ + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + return -EPERM; + } else { + prepare_kill_siginfo(sig, &kinfo, type); + } + + if (type == PIDTYPE_PGID) + return kill_pgrp_info(sig, &kinfo, pid); + + return kill_pid_info_type(sig, &kinfo, pid, type); +} + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file) SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { - int ret; struct pid *pid; - kernel_siginfo_t kinfo; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ @@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - CLASS(fd, f)(pidfd); - if (fd_empty(f)) - return -EBADF; + switch (pidfd) { + case PIDFD_SELF_THREAD: + pid = get_task_pid(current, PIDTYPE_PID); + type = PIDTYPE_PID; + break; + case PIDFD_SELF_THREAD_GROUP: + pid = get_task_pid(current, PIDTYPE_TGID); + type = PIDTYPE_TGID; + break; + default: { + CLASS(fd, f)(pidfd); + if (fd_empty(f)) + return -EBADF; - /* Is this a pidfd? */ - pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) - return PTR_ERR(pid); + /* Is this a pidfd? */ + pid = pidfd_to_pid(fd_file(f)); + if (IS_ERR(pid)) + return PTR_ERR(pid); - if (!access_pidfd_pidns(pid)) - return -EINVAL; + if (!access_pidfd_pidns(pid)) + return -EINVAL; - switch (flags) { - case 0: /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_THREAD: - type = PIDTYPE_PID; - break; - case PIDFD_SIGNAL_THREAD_GROUP: - type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_PROCESS_GROUP: - type = PIDTYPE_PGID; - break; - } - if (info) { - ret = copy_siginfo_from_user_any(&kinfo, info); - if (unlikely(ret)) - return ret; - - if (unlikely(sig != kinfo.si_signo)) - return -EINVAL; - - /* Only allow sending arbitrary signals to yourself. 
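
The switch added above lets pidfd_send_signal() accept the PIDFD_SELF_THREAD and PIDFD_SELF_THREAD_GROUP sentinels without opening a pidfd at all. A rough user-space sketch of exercising that path, assuming the installed <linux/pidfd.h> from a kernel with this change exports the sentinel and going through the raw syscall:

#define _GNU_SOURCE
#include <linux/pidfd.h>     /* PIDFD_SELF_THREAD, if the installed headers carry it */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Ignore the signal so delivery to ourselves is harmless. */
	signal(SIGUSR1, SIG_IGN);

	/* No pidfd is opened: the sentinel means "the calling thread". */
	if (syscall(SYS_pidfd_send_signal, PIDFD_SELF_THREAD, SIGUSR1, NULL, 0))
		perror("pidfd_send_signal");
	else
		puts("SIGUSR1 sent to self");
	return 0;
}
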
*/ - if ((task_pid(current) != pid || type > PIDTYPE_TGID) && - (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - return -EPERM; - } else { - prepare_kill_siginfo(sig, &kinfo, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); + } } - if (type == PIDTYPE_PGID) - return kill_pgrp_info(sig, &kinfo, pid); - else - return kill_pid_info_type(sig, &kinfo, pid, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); } static int diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 8896d844d738..5d2d0562115b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -250,6 +250,7 @@ static int multi_cpu_stop(void *data) * be detected and reported on their side. */ touch_nmi_watchdog(); + /* Also suppress RCU CPU stall warnings. */ rcu_momentary_eqs(); } } while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sys.c b/kernel/sys.c index cb366ff8703a..c434968e9f5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; + struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; @@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) goto out; if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); + change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); + free_pids(pids); return err; } @@ -1222,21 +1224,22 @@ out: return retval; } -static void set_special_pids(struct pid *pid) +static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); + change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); + change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); + struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; @@ -1252,13 +1255,14 @@ int ksys_setsid(void) goto out; group_leader->signal->leader = 1; - set_special_pids(sid); + set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); + free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); @@ -2811,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; + case PR_TIMER_CREATE_RESTORE_IDS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = posixtimer_create_prctl(arg2); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb57da499ebb..4ebe6136b08d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -54,7 +54,6 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/perf_event.h> #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> @@ -91,12 +90,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ - -#ifdef CONFIG_PERF_EVENTS -static const int six_hundred_forty_kb = 640 * 1024; -#endif - - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = 
CAP_LAST_CAP; @@ -1831,16 +1824,6 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ defined(CONFIG_DEBUG_STACKOVERFLOW) { @@ -1851,43 +1834,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_MMU) { .procname = "randomize_va_space", @@ -1906,15 +1852,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1933,63 +1870,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! 
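
The hunk above drops the perf_event_* entries from the central kern_table in kernel/sysctl.c; the comment in the removed block stresses that the /proc/sys/kernel/perf_event_* files are ABI, so the removal here presumably goes hand in hand with registering them elsewhere rather than deleting them. The feature probe the comment refers to amounts to little more than this sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Existence of the file, not its value, is the advertised feature check. */
	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK) == 0)
		puts("perf_events: available");
	else
		puts("perf_events: not available");
	return 0;
}
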
- */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_event_max_sample_rate_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, -#endif { .procname = "panic_on_warn", .data = &panic_on_warn, diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..e6e9b85d4db5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2a7802ec480c..e0eeacbe2521 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index deb1aa32814e..22376a1a75b9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -1587,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e3642278df43 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. 
*/ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..b837d3d9d325 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..7f4e4fb7381e 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a9..6222112533a7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,28 +9,23 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include 
<linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" @@ -46,39 +41,65 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; +} __timer_data __ro_after_init __aligned(2*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer 
*posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. 
+ */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -222,9 +285,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); + posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), + __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); @@ -259,7 +321,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +366,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +430,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +457,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; 
int error, new_timer_id; @@ -404,26 +490,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +526,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -453,7 +544,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } /* * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +553,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. 
+ */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +609,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +624,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. @@ -543,25 +645,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +750,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. 
*/ @@ -723,18 +807,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +821,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +830,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -791,26 +864,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +925,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +935,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. 
*/ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1027,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. + */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - timer = lock_timer(timer_id, &flags); - -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. 
- */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. - */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. @@ -1089,18 +1086,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. 
itimer_delete() has @@ -1545,3 +1550,26 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..cc15fe293719 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..c527b421c865 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..929846b8b45a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -682,20 +682,19 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act } /** - * timekeeping_forward_now - update clock to the current time + * timekeeping_forward - update clock to given cycle now value * @tk: Pointer to the timekeeper to update + * @cycle_now: Current clocksource read value * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. 
*/ -static void timekeeping_forward_now(struct timekeeper *tk) +static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) { - u64 cycle_now, delta; + u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); - cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, - tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -711,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** + * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + u64 cycle_now = tk_clock_read(&tk->tkr_mono); + + timekeeping_forward(tk, cycle_now); +} + +/** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * @@ -2151,6 +2165,54 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, return offset; } +static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset, + enum timekeeping_adv_mode mode, + unsigned int *clock_set) +{ + int shift = 0, maxshift; + + /* + * TK_ADV_FREQ indicates that adjtimex(2) directly set the + * frequency or the tick length. + * + * Accumulate the offset, so that the new multiplier starts from + * now. This is required as otherwise for offsets, which are + * smaller than tk::cycle_interval, timekeeping_adjust() could set + * xtime_nsec backwards, which subsequently causes time going + * backwards in the coarse time getters. But even for the case + * where offset is greater than tk::cycle_interval the periodic + * accumulation does not have much value. + * + * Also reset tk::ntp_error as it does not make sense to keep the + * old accumulated error around in this case. + */ + if (mode == TK_ADV_FREQ) { + timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset); + tk->ntp_error = 0; + return 0; + } + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. 
+ */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, clock_set); + if (offset < tk->cycle_interval << shift) + shift--; + } + return offset; +} + /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length @@ -2160,7 +2222,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; - int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2177,24 +2238,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* - * With NO_HZ we may have to accumulate many cycle_intervals - * (think "ticks") worth of time at once. To do this efficiently, - * we calculate the largest doubling multiple of cycle_intervals - * that is smaller than the offset. We then accumulate that - * chunk in one go, and then try to consume the next smaller - * doubled multiple. - */ - shift = ilog2(offset) - ilog2(tk->cycle_interval); - shift = max(0, shift); - /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; - shift = min(shift, maxshift); - while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, &clock_set); - if (offset < tk->cycle_interval<<shift) - shift--; - } + offset = timekeeping_accumulate(tk, offset, mode, &clock_set); /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..cfbb46cc4e76 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..01c2ab1e8971 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - 
vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,21 +87,21 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; @@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk) /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. 
*/ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef01..3a0a8cc60401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -837,6 +840,15 @@ void torture_init_end(void) EXPORT_SYMBOL_GPL(torture_init_end); /* + * Get the torture_init_begin()-time value of the jiffies counter. + */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + +/* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index adc947587eb8..997fb2a47c92 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1038,27 +1038,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. 
- */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e6d517e74e0..fd3cb2b2ab82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4100,12 +4100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, - preempt_model_none() ? "server" : - preempt_model_voluntary() ? "desktop" : - preempt_model_full() ? "preempt" : - preempt_model_lazy() ? "lazy" : - preempt_model_rt() ? "preempt_rt" : - "unknown", + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index e27305d31fc5..985ff98272da 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -920,13 +920,8 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { data->tpoint = tp; - if (!data->mod) { + if (!data->mod) data->mod = mod; - if (!try_module_get(data->mod)) { - data->tpoint = NULL; - data->mod = NULL; - } - } } } @@ -938,13 +933,7 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* - * Find a tracepoint from kernel and module. If the tracepoint is in a module, - * this increments the module refcount to prevent unloading until the - * trace_fprobe is registered to the list. After registering the trace_fprobe - * on the trace_fprobe list, the module refcount is decremented because - * tracepoint_probe_module_cb will handle it. - */ +/* Find a tracepoint from kernel and module. */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -973,6 +962,7 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } +/* Find a tracepoint from specified module. */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1008,10 +998,13 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, reenable_trace_fprobe(tf); } } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, + unregister_fprobe(&tf->fp); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + tf->tpoint = TRACEPOINT_STUB; + tf->mod = NULL; + } } } mutex_unlock(&event_mutex); @@ -1176,6 +1169,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); + /* lock module until register this tprobe. 
*/ + if (tp_mod && !try_module_get(tp_mod)) { + tpoint = NULL; + tp_mod = NULL; + } if (tpoint) { ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index f3a2722ee4c0..92e16f03fa4e 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1542,27 +1542,25 @@ static int run_osnoise(void) /* * In some cases, notably when running on a nohz_full CPU with - * a stopped tick PREEMPT_RCU has no way to account for QSs. - * This will eventually cause unwarranted noise as PREEMPT_RCU - * will force preemption as the means of ending the current - * grace period. We avoid this problem by calling - * rcu_momentary_eqs(), which performs a zero duration - * EQS allowing PREEMPT_RCU to end the current grace period. - * This call shouldn't be wrapped inside an RCU critical - * section. + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. * - * Note that in non PREEMPT_RCU kernels QSs are handled through - * cond_resched() + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. */ - if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - if (!disable_irq) - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); - rcu_momentary_eqs(); + rcu_momentary_eqs(); - if (!disable_irq) - local_irq_enable(); - } + if (!disable_irq) + local_irq_enable(); /* * For the non-preemptive kernel config: let threads runs, if @@ -1901,8 +1899,7 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -2456,8 +2453,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa0b2e47f2f2..682f40d5632d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns); struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ - u32 count; /* == 0 unless used with map_id_range_down() */ + u32 count; }; /* @@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id) * UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * -map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; - u32 first, last; + u32 first, last, id2; + + id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; @@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; - key.count = 1; + key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } -u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_up_base(extents, map, id); + extent = map_id_range_up_base(extents, map, id, count); else - extent = map_id_up_max(extents, map, id); + extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) @@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return map_id_range_up(map, id, 1); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..7e45559521af 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ out: return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } @@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) if (ret < 0) goto error; + /* + * pipe_resize_ring() does not update nr_accounted for watch_queue + * pipes, because the above vastly overprovisions. Set nr_accounted on + * and max_usage this pipe to the number that was actually charged to + * the user above via account_pipe_buffers. 
+ */ + pipe->max_usage = nr_pages; + pipe->nr_accounted = nr_pages; + ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2da7de39d06..9fa2af9dbf2c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -static void __lockup_detector_cleanup(void); - #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, @@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); @@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void) watchdog_hardlockup_start(); cpus_read_unlock(); - /* - * Must be called outside the cpus locked section to prevent - * recursive locking in the perf code. - */ - __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) @@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void) } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_cleanup(void) -{ - lockdep_assert_held(&watchdog_mutex); - hardlockup_detector_perf_cleanup(); -} - -/** - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes - * - * Caller must not hold the cpu hotplug rwsem. - */ -void lockup_detector_cleanup(void) -{ - mutex_lock(&watchdog_mutex); - __lockup_detector_cleanup(); - mutex_unlock(&watchdog_mutex); -} - /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 59c1d86a73a2..a78ff092d636 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -21,8 +21,6 @@ #include <linux/perf_event.h> static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); @@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void) PTR_ERR(evt)); return PTR_ERR(evt); } + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); this_cpu_write(watchdog_ev, evt); return 0; } @@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (event) { perf_event_disable(event); + perf_event_release_kernel(event); this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); } } /** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. - */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - -/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 35796c290ca3..b1b92a9a8f24 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -545,6 +545,17 @@ config FRAME_POINTER config OBJTOOL bool +config OBJTOOL_WERROR + bool "Upgrade objtool warnings to errors" + depends on OBJTOOL && !COMPILE_TEST + help + Fail the build on objtool warnings. + + Objtool warnings can indicate kernel instability, including boot + failures. This option is highly recommended. + + If unsure, say Y. + config STACK_VALIDATION bool "Compile-time stack metadata validation" depends on HAVE_STACK_VALIDATION && UNWINDER_FRAME_POINTER @@ -808,6 +819,15 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. +config DEBUG_VFS + bool "Debug VFS" + depends on DEBUG_KERNEL + help + Enable this to turn on extended checks in the VFS layer that may impact + performance. + + If unsure, say N. + config DEBUG_VM_IRQSOFF def_bool DEBUG_VM && !PREEMPT_RT @@ -1301,15 +1321,6 @@ endmenu # "Debug lockups and hangs" menu "Scheduler Debugging" -config SCHED_DEBUG - bool "Collect scheduler debugging info" - depends on DEBUG_KERNEL && DEBUG_FS - default y - help - If you say Y here, the /sys/kernel/debug/sched file will be provided - that can help debug the scheduler. The runtime overhead of this - option is minimal. - config SCHED_INFO bool default n @@ -2427,6 +2438,24 @@ config ASYNC_RAID6_TEST config TEST_HEXDUMP tristate "Test functions located in the hexdump module at runtime" +config PRINTF_KUNIT_TEST + tristate "KUnit test printf() family of functions at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the printf functions at runtime. + + If unsure, say N. + +config SCANF_KUNIT_TEST + tristate "KUnit test scanf() family of functions at runtime" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the scanf functions at runtime. + + If unsure, say N. + config STRING_KUNIT_TEST tristate "KUnit test string functions at runtime" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2440,12 +2469,6 @@ config STRING_HELPERS_KUNIT_TEST config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" -config TEST_PRINTF - tristate "Test printf() family of functions at runtime" - -config TEST_SCANF - tristate "Test scanf() family of functions at runtime" - config TEST_BITMAP tristate "Test bitmap_*() family of functions at runtime" help @@ -2691,6 +2714,20 @@ config SYSCTL_KUNIT_TEST If unsure, say N. +config KFIFO_KUNIT_TEST + tristate "KUnit Test for the generic kernel FIFO implementation" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the generic FIFO implementation KUnit test suite. + It tests that the API and basic functionality of the kfifo type + and associated macros. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config LIST_KUNIT_TEST tristate "KUnit Test for Kernel Linked-list structures" if !KUNIT_ALL_TESTS depends on KUNIT @@ -3166,7 +3203,7 @@ config TEST_OBJPOOL If unsure, say N. 
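The Kconfig hunks above replace several of the old lib/ runtime self-tests (printf, scanf, kfifo) with KUnit suites. For readers unfamiliar with the pattern, a minimal KUnit suite looks roughly like the sketch below. This is a hypothetical illustration, not code from the patch: the suite and case names are invented, and only the <kunit/test.h> macros and the <linux/kfifo.h> helpers are existing kernel APIs.

#include <kunit/test.h>
#include <linux/kfifo.h>

/* Hypothetical case: round-trip one element through a small kfifo. */
static void kfifo_roundtrip_test(struct kunit *test)
{
	DECLARE_KFIFO(fifo, int, 16);
	int val = 0;

	INIT_KFIFO(fifo);
	KUNIT_EXPECT_TRUE(test, kfifo_put(&fifo, 42));
	KUNIT_EXPECT_TRUE(test, kfifo_get(&fifo, &val));
	KUNIT_EXPECT_EQ(test, val, 42);
}

static struct kunit_case example_cases[] = {
	KUNIT_CASE(kfifo_roundtrip_test),
	{}
};

static struct kunit_suite example_suite = {
	.name = "kfifo-example",
	.test_cases = example_cases,
};
kunit_test_suite(example_suite);

With the corresponding CONFIG_*_KUNIT_TEST option enabled, such a suite runs automatically under CONFIG_KUNIT_ALL_TESTS or via the kunit.py tooling, which is why the old ad-hoc test modules can be retired.
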
-config INT_POW_TEST +config INT_POW_KUNIT_TEST tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS @@ -3197,6 +3234,44 @@ config INT_SQRT_KUNIT_TEST If unsure, say N +config INT_LOG_KUNIT_TEST + tristate "Integer log (int_log) test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_log library, which + provides two functions to compute the integer logarithm in base 2 and + base 10, called respectively as intlog2 and intlog10. + + If unsure, say N + +config GCD_KUNIT_TEST + tristate "Greatest common divisor test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the gcd() function, + which computes the greatest common divisor of two numbers. + + This test suite verifies the correctness of gcd() across various + scenarios, including edge cases. + + If unsure, say N + +config PRIME_NUMBERS_KUNIT_TEST + tristate "Prime number generator test" if !KUNIT_ALL_TESTS + depends on KUNIT + select PRIME_NUMBERS + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the {is,next}_prime_number + functions. + + Enabling this option will include tests that compare the prime number + generator functions against a brute force implementation. + + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 1d4aa7a83b3a..4216b3a4ff21 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -116,21 +116,22 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. -config UBSAN_SIGNED_WRAP - bool "Perform checking for signed arithmetic wrap-around" +config UBSAN_INTEGER_WRAP + bool "Perform checking for integer arithmetic wrap-around" default UBSAN depends on !COMPILE_TEST - # The no_sanitize attribute was introduced in GCC with version 8. - depends on !CC_IS_GCC || GCC_VERSION >= 80000 + depends on $(cc-option,-fsanitize-undefined-ignore-overflow-pattern=all) depends on $(cc-option,-fsanitize=signed-integer-overflow) - help - This option enables -fsanitize=signed-integer-overflow which checks - for wrap-around of any arithmetic operations with signed integers. - This currently performs nearly no instrumentation due to the - kernel's use of -fno-strict-overflow which converts all would-be - arithmetic undefined behavior into wrap-around arithmetic. Future - sanitizer versions will allow for wrap-around checking (rather than - exclusively undefined behavior). + depends on $(cc-option,-fsanitize=unsigned-integer-overflow) + depends on $(cc-option,-fsanitize=implicit-signed-integer-truncation) + depends on $(cc-option,-fsanitize=implicit-unsigned-integer-truncation) + depends on $(cc-option,-fsanitize-ignorelist=/dev/null) + help + This option enables all of the sanitizers involved in integer overflow + (wrap-around) mitigation: signed-integer-overflow, unsigned-integer-overflow, + implicit-signed-integer-truncation, and implicit-unsigned-integer-truncation. + This is currently limited only to the size_t type while testing and + compiler development continues. 
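As the new UBSAN_INTEGER_WRAP help text notes, the wrap-around sanitizers are currently scoped to size_t while compiler support matures. A tiny hypothetical illustration of the arithmetic they are meant to flag, and of the existing check_sub_overflow() helper from <linux/overflow.h> that avoids it, is sketched below; the function names are made up for this example.

#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/types.h>

/* Wraps to a huge value if used > buf_len; this is what the
 * instrumentation is intended to report at runtime. */
static size_t remaining_bytes(size_t buf_len, size_t used)
{
	return buf_len - used;
}

/* Explicitly checked variant: report the wrap instead of computing it. */
static int remaining_bytes_checked(size_t buf_len, size_t used, size_t *res)
{
	return check_sub_overflow(buf_len, used, res) ? -ERANGE : 0;
}
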
config UBSAN_BOOL bool "Perform checking for non-boolean values used as boolean" diff --git a/lib/Makefile b/lib/Makefile index d5cfc7afbbb8..b752fcbc83d2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -5,6 +5,11 @@ ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_smp_processor_id.o += -DDISABLE_BRANCH_PROFILING +endif + # These files are disabled because they produce lots of non-interesting and/or # flaky coverage that is not a function of syscall inputs. For example, # rbtree can be global and individual rotations don't correlate with inputs. @@ -52,9 +57,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ percpu-refcount.o rhashtable.o base64.o \ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o bitmap-str.o -obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o obj-y += string_helpers.o -obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o @@ -65,27 +68,20 @@ obj-$(CONFIG_TEST_DHRY) += test_dhry.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror -obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o -obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o -obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) CFLAGS_test_ubsan.o += $(call cc-disable-warning, unused-but-set-variable) UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o -obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_MIN_HEAP) += test_min_heap.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o -obj-$(CONFIG_TEST_SORT) += test_sort.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_DYNAMIC_DEBUG) += test_dynamic_debug.o -obj-$(CONFIG_TEST_PRINTF) += test_printf.o -obj-$(CONFIG_TEST_SCANF) += test_scanf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_KASAN),yy) @@ -98,7 +94,6 @@ obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o -obj-$(CONFIG_TEST_RUNTIME) += tests/ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o @@ -107,10 +102,7 @@ obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o -obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_TEST_REF_TRACKER) += test_ref_tracker.o -CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) -obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o obj-$(CONFIG_TEST_FPU) += test_fpu.o @@ -132,7 +124,7 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) -obj-y += math/ crypto/ +obj-y += math/ crypto/ tests/ vdso/ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o @@ -368,32 +360,6 @@ 
obj-$(CONFIG_OBJAGG) += objagg.o # pldmfw library obj-$(CONFIG_PLDMFW) += pldmfw/ -# KUnit tests -CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) -obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o -obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o -obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o -obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o -obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o -obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o -obj-$(CONFIG_BITS_TEST) += test_bits.o -obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o -obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o -obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o -obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o -CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) -obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o -CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) -obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) -CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) -CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) -obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o -obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o -obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o -obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o - obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o obj-$(CONFIG_FIRMWARE_TABLE) += fw_table.o diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 759ea1783cc5..d726068358c7 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -254,7 +254,7 @@ static __init int test_atomics_init(void) pr_info("passed for %s platform %s CX8 and %s SSE\n", #ifdef CONFIG_X86_64 "x86-64", -#elif defined(CONFIG_X86_CMPXCHG64) +#elif defined(CONFIG_X86_CX8) "i586+", #else "i386+", diff --git a/lib/cpumask.c b/lib/cpumask.c index 57274ba8b6d9..5adb9874fbd0 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -7,38 +7,6 @@ #include <linux/memblock.h> #include <linux/numa.h> -/** - * cpumask_next_wrap - helper to implement for_each_cpu_wrap - * @n: the cpu prior to the place to search - * @mask: the cpumask pointer - * @start: the start point of the iteration - * @wrap: assume @n crossing @start terminates the iteration - * - * Return: >= nr_cpu_ids on completion - * - * Note: the @wrap argument is required for the start condition when - * we cannot assume @start is set in @mask. - */ -unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) -{ - unsigned int next; - -again: - next = cpumask_next(n, mask); - - if (wrap && n < start && next >= start) { - return nr_cpumask_bits; - - } else if (next >= nr_cpumask_bits) { - wrap = true; - n = -1; - goto again; - } - - return next; -} -EXPORT_SYMBOL(cpumask_next_wrap); - /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** @@ -171,8 +139,7 @@ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, /* NOTE: our first selection will skip 0. 
*/ prev = __this_cpu_read(distribute_cpu_mask_prev); - next = find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), - nr_cpumask_bits, prev + 1); + next = cpumask_next_and_wrap(prev, src1p, src2p); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); @@ -192,7 +159,7 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp) /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); - next = find_next_bit_wrap(cpumask_bits(srcp), nr_cpumask_bits, prev + 1); + next = cpumask_next_wrap(prev, srcp); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 388da1aea14a..b3a85fe8b673 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -54,7 +54,7 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...) */ void dump_stack_print_info(const char *log_lvl) { - printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s" BUILD_ID_FMT "\n", + printk("%sCPU: %d UID: %u PID: %d Comm: %.20s %s%s %s %.*s %s " BUILD_ID_FMT "\n", log_lvl, raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), current->pid, current->comm, @@ -62,7 +62,7 @@ void dump_stack_print_info(const char *log_lvl) print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, BUILD_ID_VAL); + init_utsname()->version, preempt_model_str(), BUILD_ID_VAL); if (get_taint()) printk("%s%s\n", log_lvl, print_tainted_verbose()); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 65f550cb5081..8c7fdb7d8c8f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1190,8 +1190,12 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, if (!n) return -ENOMEM; p = *pages; - for (int k = 0; k < n; k++) - get_page(p[k] = page + k); + for (int k = 0; k < n; k++) { + struct folio *folio = page_folio(page); + p[k] = page + k; + if (!folio_test_slab(folio)) + folio_get(folio); + } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; diff --git a/lib/math/Makefile b/lib/math/Makefile index 853f023ae537..d1caba23baa0 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -5,8 +5,7 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o -obj-$(CONFIG_INT_POW_TEST) += tests/int_pow_kunit.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_TEST_MULDIV64) += test_mul_u64_u64_div_u64.o -obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o -obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += tests/int_sqrt_kunit.o
\ No newline at end of file + +obj-y += tests/ diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c index 9a17ee9af93a..95a6f7960db9 100644 --- a/lib/math/prime_numbers.c +++ b/lib/math/prime_numbers.c @@ -1,16 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "prime numbers: " fmt #include <linux/module.h> #include <linux/mutex.h> #include <linux/prime_numbers.h> #include <linux/slab.h> -struct primes { - struct rcu_head rcu; - unsigned long last, sz; - unsigned long primes[]; -}; +#include "prime_numbers_private.h" #if BITS_PER_LONG == 64 static const struct primes small_primes = { @@ -62,9 +57,25 @@ static const struct primes small_primes = { static DEFINE_MUTEX(lock); static const struct primes __rcu *primes = RCU_INITIALIZER(&small_primes); -static unsigned long selftest_max; +#if IS_ENABLED(CONFIG_PRIME_NUMBERS_KUNIT_TEST) +/* + * Calls the callback under RCU lock. The callback must not retain + * the primes pointer. + */ +void with_primes(void *ctx, primes_fn fn) +{ + rcu_read_lock(); + fn(ctx, rcu_dereference(primes)); + rcu_read_unlock(); +} +EXPORT_SYMBOL(with_primes); + +EXPORT_SYMBOL(slow_is_prime_number); -static bool slow_is_prime_number(unsigned long x) +#else +static +#endif +bool slow_is_prime_number(unsigned long x) { unsigned long y = int_sqrt(x); @@ -239,77 +250,13 @@ bool is_prime_number(unsigned long x) } EXPORT_SYMBOL(is_prime_number); -static void dump_primes(void) -{ - const struct primes *p; - char *buf; - - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - - rcu_read_lock(); - p = rcu_dereference(primes); - - if (buf) - bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); - pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s\n", - p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); - - rcu_read_unlock(); - - kfree(buf); -} - -static int selftest(unsigned long max) -{ - unsigned long x, last; - - if (!max) - return 0; - - for (last = 0, x = 2; x < max; x++) { - bool slow = slow_is_prime_number(x); - bool fast = is_prime_number(x); - - if (slow != fast) { - pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!\n", - x, slow ? "yes" : "no", fast ? 
"yes" : "no"); - goto err; - } - - if (!slow) - continue; - - if (next_prime_number(last) != x) { - pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu\n", - last, x, next_prime_number(last)); - goto err; - } - last = x; - } - - pr_info("%s(%lu) passed, last prime was %lu\n", __func__, x, last); - return 0; - -err: - dump_primes(); - return -EINVAL; -} - -static int __init primes_init(void) -{ - return selftest(selftest_max); -} - static void __exit primes_exit(void) { free_primes(); } -module_init(primes_init); module_exit(primes_exit); -module_param_named(selftest, selftest_max, ulong, 0400); - MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Prime number library"); MODULE_LICENSE("GPL"); diff --git a/lib/math/prime_numbers_private.h b/lib/math/prime_numbers_private.h new file mode 100644 index 000000000000..f3ebf5386e6b --- /dev/null +++ b/lib/math/prime_numbers_private.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/types.h> + +struct primes { + struct rcu_head rcu; + unsigned long last, sz; + unsigned long primes[]; +}; + +#if IS_ENABLED(CONFIG_PRIME_NUMBERS_KUNIT_TEST) +typedef void (*primes_fn)(void *, const struct primes *); + +void with_primes(void *ctx, primes_fn fn); +bool slow_is_prime_number(unsigned long x); +#endif diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile index e1a79f093b2d..13dc96e48408 100644 --- a/lib/math/tests/Makefile +++ b/lib/math/tests/Makefile @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_INT_POW_TEST) += int_pow_kunit.o -obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o +obj-$(CONFIG_GCD_KUNIT_TEST) += gcd_kunit.o +obj-$(CONFIG_INT_LOG_KUNIT_TEST) += int_log_kunit.o +obj-$(CONFIG_INT_POW_KUNIT_TEST) += int_pow_kunit.o +obj-$(CONFIG_INT_SQRT_KUNIT_TEST) += int_sqrt_kunit.o +obj-$(CONFIG_PRIME_NUMBERS_KUNIT_TEST) += prime_numbers_kunit.o +obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational_kunit.o diff --git a/lib/math/tests/gcd_kunit.c b/lib/math/tests/gcd_kunit.c new file mode 100644 index 000000000000..ede1883583b1 --- /dev/null +++ b/lib/math/tests/gcd_kunit.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <kunit/test.h> +#include <linux/gcd.h> +#include <linux/limits.h> + +struct test_case_params { + unsigned long val1; + unsigned long val2; + unsigned long expected_result; + const char *name; +}; + +static const struct test_case_params params[] = { + { 48, 18, 6, "GCD of 48 and 18" }, + { 18, 48, 6, "GCD of 18 and 48" }, + { 56, 98, 14, "GCD of 56 and 98" }, + { 17, 13, 1, "Coprime numbers" }, + { 101, 103, 1, "Coprime numbers" }, + { 270, 192, 6, "GCD of 270 and 192" }, + { 0, 5, 5, "GCD with zero" }, + { 7, 0, 7, "GCD with zero reversed" }, + { 36, 36, 36, "GCD of identical numbers" }, + { ULONG_MAX, 1, 1, "GCD of max ulong and 1" }, + { ULONG_MAX, ULONG_MAX, ULONG_MAX, "GCD of max ulong values" }, +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(gcd, params, get_desc); + +static void gcd_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, gcd(tc->val1, tc->val2)); +} + +static struct kunit_case math_gcd_test_cases[] = { + KUNIT_CASE_PARAM(gcd_test, gcd_gen_params), + {} +}; + +static struct kunit_suite gcd_test_suite = { + .name = "math-gcd", + .test_cases = math_gcd_test_cases, +}; + +kunit_test_suite(gcd_test_suite); + 
+MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("math.gcd KUnit test suite"); +MODULE_AUTHOR("Yu-Chun Lin <eleanor15x@gmail.com>"); diff --git a/lib/math/tests/int_log_kunit.c b/lib/math/tests/int_log_kunit.c new file mode 100644 index 000000000000..14e854146cb4 --- /dev/null +++ b/lib/math/tests/int_log_kunit.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> +#include <linux/int_log.h> + +struct test_case_params { + u32 value; + unsigned int expected_result; + const char *name; +}; + + +/* The expected result takes into account the log error */ +static const struct test_case_params intlog2_params[] = { + {0, 0, "Log base 2 of 0"}, + {1, 0, "Log base 2 of 1"}, + {2, 16777216, "Log base 2 of 2"}, + {3, 26591232, "Log base 2 of 3"}, + {4, 33554432, "Log base 2 of 4"}, + {8, 50331648, "Log base 2 of 8"}, + {16, 67108864, "Log base 2 of 16"}, + {32, 83886080, "Log base 2 of 32"}, + {U32_MAX, 536870911, "Log base 2 of MAX"}, +}; + +static const struct test_case_params intlog10_params[] = { + {0, 0, "Log base 10 of 0"}, + {1, 0, "Log base 10 of 1"}, + {6, 13055203, "Log base 10 of 6"}, + {10, 16777225, "Log base 10 of 10"}, + {100, 33554450, "Log base 10 of 100"}, + {1000, 50331675, "Log base 10 of 1000"}, + {10000, 67108862, "Log base 10 of 10000"}, + {U32_MAX, 161614247, "Log base 10 of MAX"} +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + + +KUNIT_ARRAY_PARAM(intlog2, intlog2_params, get_desc); + +static void intlog2_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, intlog2(tc->value)); +} + +KUNIT_ARRAY_PARAM(intlog10, intlog10_params, get_desc); + +static void intlog10_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, intlog10(tc->value)); +} + +static struct kunit_case math_int_log_test_cases[] = { + KUNIT_CASE_PARAM(intlog2_test, intlog2_gen_params), + KUNIT_CASE_PARAM(intlog10_test, intlog10_gen_params), + {} +}; + +static struct kunit_suite int_log_test_suite = { + .name = "math-int_log", + .test_cases = math_int_log_test_cases, +}; + +kunit_test_suites(&int_log_test_suite); + +MODULE_DESCRIPTION("math.int_log KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/lib/math/tests/prime_numbers_kunit.c b/lib/math/tests/prime_numbers_kunit.c new file mode 100644 index 000000000000..2f1643208c66 --- /dev/null +++ b/lib/math/tests/prime_numbers_kunit.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <kunit/test.h> +#include <linux/module.h> +#include <linux/prime_numbers.h> + +#include "../prime_numbers_private.h" + +static void dump_primes(void *ctx, const struct primes *p) +{ + static char buf[PAGE_SIZE]; + struct kunit_suite *suite = ctx; + + bitmap_print_to_pagebuf(true, buf, p->primes, p->sz); + kunit_info(suite, "primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s", + p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf); +} + +static void prime_numbers_test(struct kunit *test) +{ + const unsigned long max = 65536; + unsigned long x, last, next; + + for (last = 0, x = 2; x < max; x++) { + const bool slow = slow_is_prime_number(x); + const bool fast = is_prime_number(x); + + KUNIT_ASSERT_EQ_MSG(test, slow, fast, "is-prime(%lu)", x); + + if (!slow) + continue; + + next = next_prime_number(last); + 
KUNIT_ASSERT_EQ_MSG(test, next, x, "next-prime(%lu)", last); + last = next; + } +} + +static void kunit_suite_exit(struct kunit_suite *suite) +{ + with_primes(suite, dump_primes); +} + +static struct kunit_case prime_numbers_cases[] = { + KUNIT_CASE(prime_numbers_test), + {}, +}; + +static struct kunit_suite prime_numbers_suite = { + .name = "math-prime_numbers", + .suite_exit = kunit_suite_exit, + .test_cases = prime_numbers_cases, +}; + +kunit_test_suite(prime_numbers_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Prime number library"); +MODULE_LICENSE("GPL"); diff --git a/lib/math/rational-test.c b/lib/math/tests/rational_kunit.c index 47486a95f088..47486a95f088 100644 --- a/lib/math/rational-test.c +++ b/lib/math/tests/rational_kunit.c diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 65a75d58ed9e..c83829ef557f 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -100,34 +100,6 @@ __check_eq_pbl(const char *srcfile, unsigned int line, return true; } -static bool __init -__check_eq_u32_array(const char *srcfile, unsigned int line, - const u32 *exp_arr, unsigned int exp_len, - const u32 *arr, unsigned int len) __used; -static bool __init -__check_eq_u32_array(const char *srcfile, unsigned int line, - const u32 *exp_arr, unsigned int exp_len, - const u32 *arr, unsigned int len) -{ - if (exp_len != len) { - pr_warn("[%s:%u] array length differ: expected %u, got %u\n", - srcfile, line, - exp_len, len); - return false; - } - - if (memcmp(exp_arr, arr, len*sizeof(*arr))) { - pr_warn("[%s:%u] array contents differ\n", srcfile, line); - print_hex_dump(KERN_WARNING, " exp: ", DUMP_PREFIX_OFFSET, - 32, 4, exp_arr, exp_len*sizeof(*exp_arr), false); - print_hex_dump(KERN_WARNING, " got: ", DUMP_PREFIX_OFFSET, - 32, 4, arr, len*sizeof(*arr), false); - return false; - } - - return true; -} - static bool __init __check_eq_clump8(const char *srcfile, unsigned int line, const unsigned int offset, const unsigned int size, diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 896c0131c9a8..8f688187fa87 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -190,8 +190,7 @@ static int ot_init_hrtimer(struct ot_item *item, unsigned long hrtimer) return -ENOENT; item->hrtcycle = ktime_set(0, hrtimer * 1000000UL); - hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrt->function = ot_hrtimer_handler; + hrtimer_setup(hrt, ot_hrtimer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 5d7b10e98610..8772e5edaa4f 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -15,7 +15,7 @@ static void test_ubsan_add_overflow(void) { volatile int val = INT_MAX; - UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + UBSAN_TEST(CONFIG_UBSAN_INTEGER_WRAP); val += 2; } @@ -24,7 +24,7 @@ static void test_ubsan_sub_overflow(void) volatile int val = INT_MIN; volatile int val2 = 2; - UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + UBSAN_TEST(CONFIG_UBSAN_INTEGER_WRAP); val -= val2; } @@ -32,7 +32,7 @@ static void test_ubsan_mul_overflow(void) { volatile int val = INT_MAX / 2; - UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + UBSAN_TEST(CONFIG_UBSAN_INTEGER_WRAP); val *= 3; } @@ -40,7 +40,7 @@ static void test_ubsan_negate_overflow(void) { volatile int val = INT_MIN; - UBSAN_TEST(CONFIG_UBSAN_SIGNED_WRAP); + UBSAN_TEST(CONFIG_UBSAN_INTEGER_WRAP); val = -val; } @@ -53,6 +53,15 @@ static void test_ubsan_divrem_overflow(void) val /= val2; } +static void test_ubsan_truncate_signed(void) +{ + volatile long val = LONG_MAX; + volatile int val2 = 0; + + 
UBSAN_TEST(CONFIG_UBSAN_INTEGER_WRAP); + val2 = val; +} + static void test_ubsan_shift_out_of_bounds(void) { volatile int neg = -1, wrap = 4; @@ -127,6 +136,7 @@ static const test_ubsan_fp test_ubsan_array[] = { test_ubsan_sub_overflow, test_ubsan_mul_overflow, test_ubsan_negate_overflow, + test_ubsan_truncate_signed, test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 8e4f42cb9c54..498915255860 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -1 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for tests of kernel library functions. + +# KUnit tests +CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN) +obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o +obj-$(CONFIG_BITS_TEST) += test_bits.o +obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o +obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o +obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o +obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced) +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread) +CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation) +CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) +obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o +CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) +obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o +obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o +obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o +obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o +obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o +obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o +obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o +obj-$(CONFIG_KFIFO_KUNIT_TEST) += kfifo_kunit.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o +obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o +obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o +CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) +obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o +obj-$(CONFIG_PRINTF_KUNIT_TEST) += printf_kunit.o +obj-$(CONFIG_SCANF_KUNIT_TEST) += scanf_kunit.o +obj-$(CONFIG_SIPHASH_KUNIT_TEST) += siphash_kunit.o +obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o +obj-$(CONFIG_TEST_SORT) += test_sort.o +CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) +obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o +obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o +obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o +obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o +obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o + obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/ diff --git a/lib/bitfield_kunit.c b/lib/tests/bitfield_kunit.c index 5ccd86f61896..5ccd86f61896 100644 --- a/lib/bitfield_kunit.c +++ b/lib/tests/bitfield_kunit.c diff --git a/lib/checksum_kunit.c b/lib/tests/checksum_kunit.c index be04aa42125c..be04aa42125c 100644 --- a/lib/checksum_kunit.c +++ b/lib/tests/checksum_kunit.c diff --git a/lib/cmdline_kunit.c b/lib/tests/cmdline_kunit.c index c1602f797637..c1602f797637 100644 --- a/lib/cmdline_kunit.c +++ b/lib/tests/cmdline_kunit.c diff --git a/lib/cpumask_kunit.c b/lib/tests/cpumask_kunit.c index 6b62a6bdd50e..6b62a6bdd50e 100644 --- a/lib/cpumask_kunit.c +++ b/lib/tests/cpumask_kunit.c diff --git a/lib/crc_kunit.c b/lib/tests/crc_kunit.c index 6a61d4b5fd45..6a61d4b5fd45 100644 --- a/lib/crc_kunit.c +++ b/lib/tests/crc_kunit.c 
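For orientation, a minimal sketch of what a new KUnit test placed under lib/tests/ looks like once it is wired up with an obj-$(CONFIG_..._KUNIT_TEST) line as above; the foo_* names here are hypothetical and not part of the patch.

// SPDX-License-Identifier: GPL-2.0
#include <kunit/test.h>

/* A single test case; KUnit passes the per-test context in 'test'. */
static void foo_example_test(struct kunit *test)
{
	KUNIT_EXPECT_EQ(test, 2 + 2, 4);
}

static struct kunit_case foo_test_cases[] = {
	KUNIT_CASE(foo_example_test),
	{}
};

static struct kunit_suite foo_test_suite = {
	.name = "foo",
	.test_cases = foo_test_cases,
};
kunit_test_suite(foo_test_suite);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical example KUnit suite");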
diff --git a/lib/fortify_kunit.c b/lib/tests/fortify_kunit.c index ecb638d4cde1..29ffc62a71e3 100644 --- a/lib/fortify_kunit.c +++ b/lib/tests/fortify_kunit.c @@ -60,6 +60,7 @@ static int fortify_write_overflows; static const char array_of_10[] = "this is 10"; static const char *ptr_of_11 = "this is 11!"; +static const char * const unchanging_12 = "this is 12!!"; static char array_unknown[] = "compiler thinks I might change"; void fortify_add_kunit_error(int write) @@ -83,12 +84,28 @@ void fortify_add_kunit_error(int write) static void fortify_test_known_sizes(struct kunit *test) { + char stack[80] = "Test!"; + + KUNIT_EXPECT_FALSE(test, __is_constexpr(__builtin_strlen(stack))); + KUNIT_EXPECT_EQ(test, __compiletime_strlen(stack), 5); + + KUNIT_EXPECT_TRUE(test, __is_constexpr(__builtin_strlen("88888888"))); KUNIT_EXPECT_EQ(test, __compiletime_strlen("88888888"), 8); + + KUNIT_EXPECT_TRUE(test, __is_constexpr(__builtin_strlen(array_of_10))); KUNIT_EXPECT_EQ(test, __compiletime_strlen(array_of_10), 10); + + KUNIT_EXPECT_FALSE(test, __is_constexpr(__builtin_strlen(ptr_of_11))); KUNIT_EXPECT_EQ(test, __compiletime_strlen(ptr_of_11), 11); + KUNIT_EXPECT_TRUE(test, __is_constexpr(__builtin_strlen(unchanging_12))); + KUNIT_EXPECT_EQ(test, __compiletime_strlen(unchanging_12), 12); + + KUNIT_EXPECT_FALSE(test, __is_constexpr(__builtin_strlen(array_unknown))); KUNIT_EXPECT_EQ(test, __compiletime_strlen(array_unknown), SIZE_MAX); + /* Externally defined and dynamically sized string pointer: */ + KUNIT_EXPECT_FALSE(test, __is_constexpr(__builtin_strlen(test->name))); KUNIT_EXPECT_EQ(test, __compiletime_strlen(test->name), SIZE_MAX); } @@ -394,8 +411,6 @@ struct fortify_padding { char buf[32]; unsigned long bytes_after; }; -/* Force compiler into not being able to resolve size at compile-time. */ -static volatile int unconst; static void fortify_test_strlen(struct kunit *test) { @@ -520,57 +535,56 @@ static void fortify_test_strncpy(struct kunit *test) { struct fortify_padding pad = { }; char src[] = "Copy me fully into a small buffer and I will overflow!"; + size_t sizeof_buf = sizeof(pad.buf); + + OPTIMIZER_HIDE_VAR(sizeof_buf); /* Destination is %NUL-filled to start with. */ KUNIT_EXPECT_EQ(test, pad.bytes_before, 0); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 3], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 3], '\0'); KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); /* Legitimate strncpy() 1 less than of max size. */ - KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, - sizeof(pad.buf) + unconst - 1) + KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, sizeof_buf - 1) == pad.buf); KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); /* Only last byte should be %NUL */ - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 3], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 3], '\0'); /* Legitimate (though unterminated) max-size strncpy. 
*/ - KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, - sizeof(pad.buf) + unconst) + KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, sizeof_buf) == pad.buf); KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); /* No trailing %NUL -- thanks strncpy API. */ - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* But we will not have gone beyond. */ KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); /* Now verify that FORTIFY is working... */ - KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, - sizeof(pad.buf) + unconst + 1) + KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, sizeof_buf + 1) == pad.buf); /* Should catch the overflow. */ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 1); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* And we will not have gone beyond. */ KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); /* And further... */ - KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, - sizeof(pad.buf) + unconst + 2) + KUNIT_ASSERT_TRUE(test, strncpy(pad.buf, src, sizeof_buf + 2) == pad.buf); /* Should catch the overflow. */ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 2); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* And we will not have gone beyond. */ KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); } @@ -579,55 +593,56 @@ static void fortify_test_strscpy(struct kunit *test) { struct fortify_padding pad = { }; char src[] = "Copy me fully into a small buffer and I will overflow!"; + size_t sizeof_buf = sizeof(pad.buf); + size_t sizeof_src = sizeof(src); + + OPTIMIZER_HIDE_VAR(sizeof_buf); + OPTIMIZER_HIDE_VAR(sizeof_src); /* Destination is %NUL-filled to start with. */ KUNIT_EXPECT_EQ(test, pad.bytes_before, 0); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 3], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 3], '\0'); KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); /* Legitimate strscpy() 1 less than of max size. 
*/ - KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, - sizeof(pad.buf) + unconst - 1), + KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, sizeof_buf - 1), -E2BIG); KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); /* Keeping space for %NUL, last two bytes should be %NUL */ - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 3], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 3], '\0'); /* Legitimate max-size strscpy. */ - KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, - sizeof(pad.buf) + unconst), + KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, sizeof_buf), -E2BIG); KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); /* A trailing %NUL will exist. */ - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* Now verify that FORTIFY is working... */ - KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, - sizeof(pad.buf) + unconst + 1), + KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, sizeof_buf + 1), -E2BIG); /* Should catch the overflow. */ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 1); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* And we will not have gone beyond. */ KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); /* And much further... */ - KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, - sizeof(src) * 2 + unconst), + KUNIT_ASSERT_EQ(test, strscpy(pad.buf, src, sizeof_src * 2), -E2BIG); /* Should catch the overflow. */ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 2); - KUNIT_EXPECT_EQ(test, pad.buf[sizeof(pad.buf) - 1], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); - KUNIT_EXPECT_NE(test, pad.buf[sizeof(pad.buf) - 2], '\0'); + KUNIT_EXPECT_EQ(test, pad.buf[sizeof_buf - 1], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); + KUNIT_EXPECT_NE(test, pad.buf[sizeof_buf - 2], '\0'); /* And we will not have gone beyond. */ KUNIT_EXPECT_EQ(test, pad.bytes_after, 0); } @@ -767,7 +782,9 @@ static void fortify_test_strlcat(struct kunit *test) struct fortify_padding pad = { }; char src[sizeof(pad.buf)] = { }; int i, partial; - int len = sizeof(pad.buf) + unconst; + int len = sizeof(pad.buf); + + OPTIMIZER_HIDE_VAR(len); /* Fill 15 bytes with valid characters. 
*/ partial = sizeof(src) / 2 - 1; @@ -857,28 +874,32 @@ struct fortify_zero_sized { #define __fortify_test(memfunc) \ static void fortify_test_##memfunc(struct kunit *test) \ { \ - struct fortify_zero_sized zero = { }; \ + struct fortify_zero_sized empty = { }; \ struct fortify_padding pad = { }; \ char srcA[sizeof(pad.buf) + 2]; \ char srcB[sizeof(pad.buf) + 2]; \ - size_t len = sizeof(pad.buf) + unconst; \ + size_t len = sizeof(pad.buf); \ + size_t zero = 0; \ + \ + OPTIMIZER_HIDE_VAR(len); \ + OPTIMIZER_HIDE_VAR(zero); \ \ memset(srcA, 'A', sizeof(srcA)); \ KUNIT_ASSERT_EQ(test, srcA[0], 'A'); \ memset(srcB, 'B', sizeof(srcB)); \ KUNIT_ASSERT_EQ(test, srcB[0], 'B'); \ \ - memfunc(pad.buf, srcA, 0 + unconst); \ + memfunc(pad.buf, srcA, zero); \ KUNIT_EXPECT_EQ(test, pad.buf[0], '\0'); \ KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0); \ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); \ - memfunc(pad.buf + 1, srcB, 1 + unconst); \ + memfunc(pad.buf + 1, srcB, zero + 1); \ KUNIT_EXPECT_EQ(test, pad.buf[0], '\0'); \ KUNIT_EXPECT_EQ(test, pad.buf[1], 'B'); \ KUNIT_EXPECT_EQ(test, pad.buf[2], '\0'); \ KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0); \ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); \ - memfunc(pad.buf, srcA, 1 + unconst); \ + memfunc(pad.buf, srcA, zero + 1); \ KUNIT_EXPECT_EQ(test, pad.buf[0], 'A'); \ KUNIT_EXPECT_EQ(test, pad.buf[1], 'B'); \ KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0); \ @@ -904,10 +925,10 @@ static void fortify_test_##memfunc(struct kunit *test) \ /* Reset error counter. */ \ fortify_write_overflows = 0; \ /* Copy nothing into nothing: no errors. */ \ - memfunc(zero.buf, srcB, 0 + unconst); \ + memfunc(empty.buf, srcB, zero); \ KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0); \ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0); \ - memfunc(zero.buf, srcB, 1 + unconst); \ + memfunc(empty.buf, srcB, zero + 1); \ KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0); \ KUNIT_EXPECT_EQ(test, fortify_write_overflows, 1); \ } @@ -919,7 +940,9 @@ static void fortify_test_memscan(struct kunit *test) char haystack[] = "Where oh where is my memory range?"; char *mem = haystack + strlen("Where oh where is "); char needle = 'm'; - size_t len = sizeof(haystack) + unconst; + size_t len = sizeof(haystack); + + OPTIMIZER_HIDE_VAR(len); KUNIT_ASSERT_PTR_EQ(test, memscan(haystack, needle, len), mem); @@ -938,7 +961,9 @@ static void fortify_test_memchr(struct kunit *test) char haystack[] = "Where oh where is my memory range?"; char *mem = haystack + strlen("Where oh where is "); char needle = 'm'; - size_t len = sizeof(haystack) + unconst; + size_t len = sizeof(haystack); + + OPTIMIZER_HIDE_VAR(len); KUNIT_ASSERT_PTR_EQ(test, memchr(haystack, needle, len), mem); @@ -957,7 +982,9 @@ static void fortify_test_memchr_inv(struct kunit *test) char haystack[] = "Where oh where is my memory range?"; char *mem = haystack + 1; char needle = 'W'; - size_t len = sizeof(haystack) + unconst; + size_t len = sizeof(haystack); + + OPTIMIZER_HIDE_VAR(len); /* Normal search is okay. */ KUNIT_ASSERT_PTR_EQ(test, memchr_inv(haystack, needle, len), @@ -976,8 +1003,11 @@ static void fortify_test_memcmp(struct kunit *test) { char one[] = "My mind is going ..."; char two[] = "My mind is going ... I can feel it."; - size_t one_len = sizeof(one) + unconst - 1; - size_t two_len = sizeof(two) + unconst - 1; + size_t one_len = sizeof(one) - 1; + size_t two_len = sizeof(two) - 1; + + OPTIMIZER_HIDE_VAR(one_len); + OPTIMIZER_HIDE_VAR(two_len); /* We match the first string (ignoring the %NUL). 
*/ KUNIT_ASSERT_EQ(test, memcmp(one, two, one_len), 0); @@ -998,7 +1028,9 @@ static void fortify_test_kmemdup(struct kunit *test) { char src[] = "I got Doom running on it!"; char *copy; - size_t len = sizeof(src) + unconst; + size_t len = sizeof(src); + + OPTIMIZER_HIDE_VAR(len); /* Copy is within bounds. */ copy = kmemdup(src, len, GFP_KERNEL); diff --git a/lib/hashtable_test.c b/lib/tests/hashtable_test.c index 3521de6bad15..3521de6bad15 100644 --- a/lib/hashtable_test.c +++ b/lib/tests/hashtable_test.c diff --git a/lib/is_signed_type_kunit.c b/lib/tests/is_signed_type_kunit.c index 88adbe813f3a..88adbe813f3a 100644 --- a/lib/is_signed_type_kunit.c +++ b/lib/tests/is_signed_type_kunit.c diff --git a/lib/tests/kfifo_kunit.c b/lib/tests/kfifo_kunit.c new file mode 100644 index 000000000000..a85eedc3195a --- /dev/null +++ b/lib/tests/kfifo_kunit.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for the generic kernel FIFO implementation. + * + * Copyright (C) 2024 Diego Vieira <diego.daniel.professional@gmail.com> + */ +#include <kunit/test.h> + +#include <linux/kfifo.h> + +#define KFIFO_SIZE 32 +#define N_ELEMENTS 5 + +static void kfifo_test_reset_should_clear_the_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + kfifo_put(&my_fifo, 1); + kfifo_put(&my_fifo, 2); + kfifo_put(&my_fifo, 3); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + kfifo_reset(&my_fifo); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); +} + +static void kfifo_test_define_should_define_an_empty_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_len_should_ret_n_of_stored_elements(struct kunit *test) +{ + u8 buffer1[N_ELEMENTS]; + + for (int i = 0; i < N_ELEMENTS; i++) + buffer1[i] = i + 1; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS * 2); + + kfifo_reset(&my_fifo); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_put_should_insert_and_get_should_pop(struct kunit *test) +{ + u8 out_data = 0; + int processed_elements; + u8 elements[] = { 3, 5, 11 }; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + // If the fifo is empty, get returns 0 + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 0); + KUNIT_EXPECT_EQ(test, out_data, 0); + + for (int i = 0; i < 3; i++) + kfifo_put(&my_fifo, elements[i]); + + for (int i = 0; i < 3; i++) { + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, elements[i]); + } +} + +static void kfifo_test_in_should_insert_multiple_elements(struct kunit *test) +{ + u8 in_buffer[] = { 11, 25, 65 }; + u8 out_data; + int processed_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + kfifo_in(&my_fifo, in_buffer, 3); + + for (int i = 0; i < 3; i++) { + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, in_buffer[i]); + } +} + +static void kfifo_test_out_should_pop_multiple_elements(struct kunit *test) +{ + 
u8 in_buffer[] = { 11, 25, 65 }; + u8 out_buffer[3]; + int copied_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + for (int i = 0; i < 3; i++) + kfifo_put(&my_fifo, in_buffer[i]); + + copied_elements = kfifo_out(&my_fifo, out_buffer, 3); + KUNIT_EXPECT_EQ(test, copied_elements, 3); + + for (int i = 0; i < 3; i++) + KUNIT_EXPECT_EQ(test, out_buffer[i], in_buffer[i]); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); +} + +static void kfifo_test_dec_init_should_define_an_empty_fifo(struct kunit *test) +{ + DECLARE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + INIT_KFIFO(my_fifo); + + // my_fifo is a struct with an in-place buffer + KUNIT_EXPECT_FALSE(test, __is_kfifo_ptr(&my_fifo)); + + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); +} + +static void kfifo_test_define_should_equal_declare_init(struct kunit *test) +{ + // declare a variable my_fifo of type struct kfifo of u8 + DECLARE_KFIFO(my_fifo1, u8, KFIFO_SIZE); + // initialize the my_fifo variable + INIT_KFIFO(my_fifo1); + + // DEFINE_KFIFO declares the variable with the initial value + // essentially the same as calling DECLARE_KFIFO and INIT_KFIFO + DEFINE_KFIFO(my_fifo2, u8, KFIFO_SIZE); + + // my_fifo1 and my_fifo2 have the same size + KUNIT_EXPECT_EQ(test, sizeof(my_fifo1), sizeof(my_fifo2)); + KUNIT_EXPECT_EQ(test, kfifo_initialized(&my_fifo1), + kfifo_initialized(&my_fifo2)); + KUNIT_EXPECT_EQ(test, kfifo_is_empty(&my_fifo1), + kfifo_is_empty(&my_fifo2)); +} + +static void kfifo_test_alloc_should_initialize_a_ptr_fifo(struct kunit *test) +{ + int ret; + DECLARE_KFIFO_PTR(my_fifo, u8); + + INIT_KFIFO(my_fifo); + + // kfifo_initialized returns false signaling the buffer pointer is NULL + KUNIT_EXPECT_FALSE(test, kfifo_initialized(&my_fifo)); + + // kfifo_alloc allocates the buffer + ret = kfifo_alloc(&my_fifo, KFIFO_SIZE, GFP_KERNEL); + KUNIT_EXPECT_EQ_MSG(test, ret, 0, "Memory allocation should succeed"); + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); + + // kfifo_free frees the buffer + kfifo_free(&my_fifo); +} + +static void kfifo_test_peek_should_not_remove_elements(struct kunit *test) +{ + u8 out_data; + int processed_elements; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + // If the fifo is empty, peek returns 0 + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 0); + + kfifo_put(&my_fifo, 3); + kfifo_put(&my_fifo, 5); + kfifo_put(&my_fifo, 11); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, 3); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + // Using peek doesn't remove the element + // so the read element and the fifo length + // remain the same + processed_elements = kfifo_peek(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, 3); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); +} + +static struct kunit_case kfifo_test_cases[] = { + KUNIT_CASE(kfifo_test_reset_should_clear_the_fifo), + KUNIT_CASE(kfifo_test_define_should_define_an_empty_fifo), + KUNIT_CASE(kfifo_test_len_should_ret_n_of_stored_elements), + KUNIT_CASE(kfifo_test_put_should_insert_and_get_should_pop), + KUNIT_CASE(kfifo_test_in_should_insert_multiple_elements), + KUNIT_CASE(kfifo_test_out_should_pop_multiple_elements), + KUNIT_CASE(kfifo_test_dec_init_should_define_an_empty_fifo), + KUNIT_CASE(kfifo_test_define_should_equal_declare_init), +
KUNIT_CASE(kfifo_test_alloc_should_initialize_a_ptr_fifo), + KUNIT_CASE(kfifo_test_peek_should_not_remove_elements), + {}, +}; + +static struct kunit_suite kfifo_test_module = { + .name = "kfifo", + .test_cases = kfifo_test_cases, +}; + +kunit_test_suites(&kfifo_test_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Diego Vieira <diego.daniel.professional@gmail.com>"); +MODULE_DESCRIPTION("KUnit test for the kernel FIFO"); diff --git a/lib/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c index 48342736d016..48342736d016 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/tests/kunit_iov_iter.c diff --git a/lib/list-test.c b/lib/tests/list-test.c index 9135cdc1bb39..9135cdc1bb39 100644 --- a/lib/list-test.c +++ b/lib/tests/list-test.c diff --git a/lib/memcpy_kunit.c b/lib/tests/memcpy_kunit.c index d36933554e46..d36933554e46 100644 --- a/lib/memcpy_kunit.c +++ b/lib/tests/memcpy_kunit.c diff --git a/lib/overflow_kunit.c b/lib/tests/overflow_kunit.c index 5222c6393f11..894691b4411a 100644 --- a/lib/overflow_kunit.c +++ b/lib/tests/overflow_kunit.c @@ -1185,22 +1185,40 @@ struct bar { static void DEFINE_FLEX_test(struct kunit *test) { - /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ - DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); -#ifdef CONFIG_CC_HAS_COUNTED_BY - int expected_raw_size = sizeof(struct foo); -#else - int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16); -#endif - /* Without annotation, it will always be on-stack size. */ DEFINE_RAW_FLEX(struct bar, two, array, 2); DEFINE_FLEX(struct foo, eight, array, counter, 8); DEFINE_FLEX(struct foo, empty, array, counter, 0); + /* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */ + DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2); + int array_size_override = 0; - KUNIT_EXPECT_EQ(test, __struct_size(two_but_zero), expected_raw_size); + KUNIT_EXPECT_EQ(test, sizeof(*two), sizeof(struct bar)); KUNIT_EXPECT_EQ(test, __struct_size(two), sizeof(struct bar) + 2 * sizeof(s16)); - KUNIT_EXPECT_EQ(test, __struct_size(eight), 24); + KUNIT_EXPECT_EQ(test, __member_size(two), sizeof(struct bar) + 2 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(two->array), 2 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __member_size(two->array), 2 * sizeof(s16)); + + KUNIT_EXPECT_EQ(test, sizeof(*eight), sizeof(struct foo)); + KUNIT_EXPECT_EQ(test, __struct_size(eight), sizeof(struct foo) + 8 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __member_size(eight), sizeof(struct foo) + 8 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(eight->array), 8 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __member_size(eight->array), 8 * sizeof(s16)); + + KUNIT_EXPECT_EQ(test, sizeof(*empty), sizeof(struct foo)); KUNIT_EXPECT_EQ(test, __struct_size(empty), sizeof(struct foo)); + KUNIT_EXPECT_EQ(test, __member_size(empty), sizeof(struct foo)); + KUNIT_EXPECT_EQ(test, __struct_size(empty->array), 0); + KUNIT_EXPECT_EQ(test, __member_size(empty->array), 0); + + /* If __counted_by is not being used, array size will have the on-stack size.
*/ + if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY)) + array_size_override = 2 * sizeof(s16); + + KUNIT_EXPECT_EQ(test, sizeof(*two_but_zero), sizeof(struct foo)); + KUNIT_EXPECT_EQ(test, __struct_size(two_but_zero), sizeof(struct foo) + 2 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __member_size(two_but_zero), sizeof(struct foo) + 2 * sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(two_but_zero->array), array_size_override); + KUNIT_EXPECT_EQ(test, __member_size(two_but_zero->array), array_size_override); } static struct kunit_case overflow_test_cases[] = { diff --git a/lib/test_printf.c b/lib/tests/printf_kunit.c index 59dbe4f9a4cb..2c9f6170bacd 100644 --- a/lib/test_printf.c +++ b/lib/tests/printf_kunit.c @@ -3,9 +3,7 @@ * Test cases for printf facility. */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/init.h> +#include <kunit/test.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/printk.h> @@ -25,8 +23,6 @@ #include <linux/property.h> -#include "../tools/testing/selftests/kselftest_module.h" - #define BUF_SIZE 256 #define PAD_SIZE 16 #define FILL_CHAR '$' @@ -37,14 +33,14 @@ block \ __diag_pop(); -KSTM_MODULE_GLOBALS(); +static unsigned int total_tests; -static char *test_buffer __initdata; -static char *alloced_buffer __initdata; +static char *test_buffer; +static char *alloced_buffer; -static int __printf(4, 0) __init -do_test(int bufsize, const char *expect, int elen, - const char *fmt, va_list ap) +static void __printf(7, 0) +do_test(struct kunit *kunittest, const char *file, const int line, int bufsize, const char *expect, + int elen, const char *fmt, va_list ap) { va_list aq; int ret, written; @@ -57,62 +53,70 @@ do_test(int bufsize, const char *expect, int elen, va_end(aq); if (ret != elen) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n", - bufsize, fmt, ret, elen); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n", + file, line, bufsize, fmt, ret, elen); + return; } if (memchr_inv(alloced_buffer, FILL_CHAR, PAD_SIZE)) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", bufsize, fmt); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", + file, line, bufsize, fmt); + return; } if (!bufsize) { if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE + PAD_SIZE)) { - pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", - fmt); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", + file, line, fmt); } - return 0; + return; } written = min(bufsize-1, elen); if (test_buffer[written]) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) did not nul-terminate buffer\n", - bufsize, fmt); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) did not nul-terminate buffer\n", + file, line, bufsize, fmt); + return; } if (memchr_inv(test_buffer + written + 1, FILL_CHAR, bufsize - (written + 1))) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n", - bufsize, fmt); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n", + file, line, bufsize, fmt); + return; } if (memchr_inv(test_buffer + bufsize, FILL_CHAR, BUF_SIZE + PAD_SIZE - bufsize)) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond buffer\n", bufsize, fmt); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) 
wrote beyond buffer\n", + file, line, bufsize, fmt); + return; } if (memcmp(test_buffer, expect, written)) { - pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", - bufsize, fmt, test_buffer, written, expect); - return 1; + KUNIT_FAIL(kunittest, + "%s:%d: vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", + file, line, bufsize, fmt, test_buffer, written, expect); + return; } - return 0; } -static void __printf(3, 4) __init -__test(const char *expect, int elen, const char *fmt, ...) +static void __printf(6, 7) +__test(struct kunit *kunittest, const char *file, const int line, const char *expect, int elen, + const char *fmt, ...) { va_list ap; int rand; char *p; if (elen >= BUF_SIZE) { - pr_err("error in test suite: expected output length %d too long. Format was '%s'.\n", - elen, fmt); - failed_tests++; + KUNIT_FAIL(kunittest, + "%s:%d: error in test suite: expected length (%d) >= BUF_SIZE (%d). fmt=\"%s\"\n", + file, line, elen, BUF_SIZE, fmt); return; } @@ -124,19 +128,19 @@ __test(const char *expect, int elen, const char *fmt, ...) * enough and 0), and then we also test that kvasprintf would * be able to print it as expected. */ - failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap); + do_test(kunittest, file, line, BUF_SIZE, expect, elen, fmt, ap); rand = get_random_u32_inclusive(1, elen + 1); /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */ - failed_tests += do_test(rand, expect, elen, fmt, ap); - failed_tests += do_test(0, expect, elen, fmt, ap); + do_test(kunittest, file, line, rand, expect, elen, fmt, ap); + do_test(kunittest, file, line, 0, expect, elen, fmt, ap); p = kvasprintf(GFP_KERNEL, fmt, ap); if (p) { total_tests++; if (memcmp(p, expect, elen+1)) { - pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", - fmt, p, expect); - failed_tests++; + KUNIT_FAIL(kunittest, + "%s:%d: kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", + file, line, fmt, p, expect); } kfree(p); } @@ -144,10 +148,10 @@ __test(const char *expect, int elen, const char *fmt, ...) } #define test(expect, fmt, ...) \ - __test(expect, strlen(expect), fmt, ##__VA_ARGS__) + __test(kunittest, __FILE__, __LINE__, expect, strlen(expect), fmt, ##__VA_ARGS__) -static void __init -test_basic(void) +static void +test_basic(struct kunit *kunittest) { /* Work around annoying "warning: zero-length gnu_printf format string". */ char nul = '\0'; @@ -155,11 +159,11 @@ test_basic(void) test("", &nul); test("100%", "100%%"); test("xxx%yyy", "xxx%cyyy", '%'); - __test("xxx\0yyy", 7, "xxx%cyyy", '\0'); + __test(kunittest, __FILE__, __LINE__, "xxx\0yyy", 7, "xxx%cyyy", '\0'); } -static void __init -test_number(void) +static void +test_number(struct kunit *kunittest) { test("0x1234abcd ", "%#-12x", 0x1234abcd); test(" 0x1234abcd", "%#12x", 0x1234abcd); @@ -180,8 +184,8 @@ test_number(void) test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0); } -static void __init -test_string(void) +static void +test_string(struct kunit *kunittest) { test("", "%s%.0s", "", "123"); test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); @@ -218,29 +222,6 @@ test_string(void) #define ZEROS "00000000" /* hex 32 zero bits */ #define ONES "ffffffff" /* hex 32 one bits */ -static int __init -plain_format(void) -{ - char buf[PLAIN_BUF_SIZE]; - int nchars; - - nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", PTR); - - if (nchars != PTR_WIDTH) - return -1; - - if (strncmp(buf, PTR_VAL_NO_CRNG, PTR_WIDTH) == 0) { - pr_warn("crng possibly not yet initialized. 
plain 'p' buffer contains \"%s\"", - PTR_VAL_NO_CRNG); - return 0; - } - - if (strncmp(buf, ZEROS, strlen(ZEROS)) != 0) - return -1; - - return 0; -} - #else #define PTR_WIDTH 8 @@ -250,92 +231,47 @@ plain_format(void) #define ZEROS "" #define ONES "" -static int __init -plain_format(void) -{ - /* Format is implicitly tested for 32 bit machines by plain_hash() */ - return 0; -} - #endif /* BITS_PER_LONG == 64 */ -static int __init -plain_hash_to_buffer(const void *p, char *buf, size_t len) +static void +plain_hash_to_buffer(struct kunit *kunittest, const void *p, char *buf, size_t len) { - int nchars; - - nchars = snprintf(buf, len, "%p", p); - - if (nchars != PTR_WIDTH) - return -1; + KUNIT_ASSERT_EQ(kunittest, snprintf(buf, len, "%p", p), PTR_WIDTH); if (strncmp(buf, PTR_VAL_NO_CRNG, PTR_WIDTH) == 0) { - pr_warn("crng possibly not yet initialized. plain 'p' buffer contains \"%s\"", - PTR_VAL_NO_CRNG); - return 0; + kunit_skip(kunittest, + "crng possibly not yet initialized. plain 'p' buffer contains \"%s\"\n", + PTR_VAL_NO_CRNG); } - - return 0; } -static int __init -plain_hash(void) +static void +hash_pointer(struct kunit *kunittest) { - char buf[PLAIN_BUF_SIZE]; - int ret; - - ret = plain_hash_to_buffer(PTR, buf, PLAIN_BUF_SIZE); - if (ret) - return ret; - - if (strncmp(buf, PTR_STR, PTR_WIDTH) == 0) - return -1; - - return 0; -} + if (no_hash_pointers) + kunit_skip(kunittest, "hash pointers disabled"); -/* - * We can't use test() to test %p because we don't know what output to expect - * after an address is hashed. - */ -static void __init -plain(void) -{ - int err; + char buf[PLAIN_BUF_SIZE]; - if (no_hash_pointers) { - pr_warn("skipping plain 'p' tests"); - skipped_tests += 2; - return; - } + plain_hash_to_buffer(kunittest, PTR, buf, PLAIN_BUF_SIZE); - err = plain_hash(); - if (err) { - pr_warn("plain 'p' does not appear to be hashed\n"); - failed_tests++; - return; - } + /* + * The hash of %p is unpredictable, therefore test() cannot be used. + * + * Instead verify that the first 32 bits are zeros on a 64-bit system + * and that the non-hashed value is not printed. + */ - err = plain_format(); - if (err) { - pr_warn("hashing plain 'p' has unexpected format\n"); - failed_tests++; - } + KUNIT_EXPECT_MEMEQ(kunittest, buf, ZEROS, strlen(ZEROS)); + KUNIT_EXPECT_MEMNEQ(kunittest, buf, PTR_STR, PTR_WIDTH); } -static void __init -test_hashed(const char *fmt, const void *p) +static void +test_hashed(struct kunit *kunittest, const char *fmt, const void *p) { char buf[PLAIN_BUF_SIZE]; - int ret; - /* - * No need to increase failed test counter since this is assumed - * to be called after plain(). - */ - ret = plain_hash_to_buffer(p, buf, PLAIN_BUF_SIZE); - if (ret) - return; + plain_hash_to_buffer(kunittest, p, buf, PLAIN_BUF_SIZE); test(buf, fmt, p); } @@ -343,8 +279,8 @@ test_hashed(const char *fmt, const void *p) /* * NULL pointers aren't hashed. */ -static void __init -null_pointer(void) +static void +null_pointer(struct kunit *kunittest) { test(ZEROS "00000000", "%p", NULL); test(ZEROS "00000000", "%px", NULL); @@ -354,8 +290,8 @@ null_pointer(void) /* * Error pointers aren't hashed. 
*/ -static void __init -error_pointer(void) +static void +error_pointer(struct kunit *kunittest) { test(ONES "fffffff5", "%p", ERR_PTR(-11)); test(ONES "fffffff5", "%px", ERR_PTR(-11)); @@ -364,27 +300,27 @@ error_pointer(void) #define PTR_INVALID ((void *)0x000000ab) -static void __init -invalid_pointer(void) +static void +invalid_pointer(struct kunit *kunittest) { - test_hashed("%p", PTR_INVALID); + test_hashed(kunittest, "%p", PTR_INVALID); test(ZEROS "000000ab", "%px", PTR_INVALID); test("(efault)", "%pE", PTR_INVALID); } -static void __init -symbol_ptr(void) +static void +symbol_ptr(struct kunit *kunittest) { } -static void __init -kernel_ptr(void) +static void +kernel_ptr(struct kunit *kunittest) { /* We can't test this without access to kptr_restrict. */ } -static void __init -struct_resource(void) +static void +struct_resource(struct kunit *kunittest) { struct resource test_resource = { .start = 0xc0ffee00, @@ -432,8 +368,8 @@ struct_resource(void) "%pR", &test_resource); } -static void __init -struct_range(void) +static void +struct_range(struct kunit *kunittest) { struct range test_range = DEFINE_RANGE(0xc0ffee00ba5eba11, 0xc0ffee00ba5eba11); @@ -448,18 +384,18 @@ struct_range(void) "%pra", &test_range); } -static void __init -addr(void) +static void +addr(struct kunit *kunittest) { } -static void __init -escaped_str(void) +static void +escaped_str(struct kunit *kunittest) { } -static void __init -hex_string(void) +static void +hex_string(struct kunit *kunittest) { const char buf[3] = {0xc0, 0xff, 0xee}; @@ -469,8 +405,8 @@ hex_string(void) "%*ph|%*phC|%*phD|%*phN", 3, buf, 3, buf, 3, buf, 3, buf); } -static void __init -mac(void) +static void +mac(struct kunit *kunittest) { const u8 addr[6] = {0x2d, 0x48, 0xd6, 0xfc, 0x7a, 0x05}; @@ -481,8 +417,8 @@ mac(void) test("057afcd6482d", "%pmR", addr); } -static void __init -ip4(void) +static void +ip4(struct kunit *kunittest) { struct sockaddr_in sa; @@ -496,20 +432,13 @@ ip4(void) test("001.002.003.004:12345|1.2.3.4:12345", "%piSp|%pISp", &sa, &sa); } -static void __init -ip6(void) +static void +ip6(struct kunit *kunittest) { } -static void __init -ip(void) -{ - ip4(); - ip6(); -} - -static void __init -uuid(void) +static void +uuid(struct kunit *kunittest) { const char uuid[16] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}; @@ -520,7 +449,7 @@ uuid(void) test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); } -static struct dentry test_dentry[4] __initdata = { +static struct dentry test_dentry[4] = { { .d_parent = &test_dentry[0], .d_name = QSTR_INIT(test_dentry[0].d_iname, 3), .d_iname = "foo" }, @@ -535,8 +464,8 @@ static struct dentry test_dentry[4] __initdata = { .d_iname = "romeo" }, }; -static void __init -dentry(void) +static void +dentry(struct kunit *kunittest) { test("foo", "%pd", &test_dentry[0]); test("foo", "%pd2", &test_dentry[0]); @@ -556,13 +485,13 @@ dentry(void) test(" bravo/alfa| bravo/alfa", "%12pd2|%*pd2", &test_dentry[2], 12, &test_dentry[2]); } -static void __init -struct_va_format(void) +static void +struct_va_format(struct kunit *kunittest) { } -static void __init -time_and_date(void) +static void +time_and_date(struct kunit *kunittest) { /* 1543210543 */ const struct rtc_time tm = { @@ -595,13 +524,13 @@ time_and_date(void) test("15:32:23|0119-00-04", "%ptTtrs|%ptTdrs", &t, &t); } -static void __init -struct_clk(void) +static void +struct_clk(struct kunit *kunittest) { } -static void __init -large_bitmap(void) +static void +large_bitmap(struct kunit *kunittest) { 
const int nbits = 1 << 16; unsigned long *bits = bitmap_zalloc(nbits, GFP_KERNEL); @@ -614,8 +543,8 @@ large_bitmap(void) bitmap_free(bits); } -static void __init -bitmap(void) +static void +bitmap(struct kunit *kunittest) { DECLARE_BITMAP(bits, 20); const int primes[] = {2,3,5,7,11,13,17,19}; @@ -634,11 +563,11 @@ bitmap(void) test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); - large_bitmap(); + large_bitmap(kunittest); } -static void __init -netdev_features(void) +static void +netdev_features(struct kunit *kunittest) { } @@ -663,9 +592,9 @@ static const struct page_flags_test pft[] = { "%#x", "kasantag"}, }; -static void __init -page_flags_test(int section, int node, int zone, int last_cpupid, - int kasan_tag, unsigned long flags, const char *name, +static void +page_flags_test(struct kunit *kunittest, int section, int node, int zone, + int last_cpupid, int kasan_tag, unsigned long flags, const char *name, char *cmp_buf) { unsigned long values[] = {section, node, zone, last_cpupid, kasan_tag}; @@ -701,26 +630,25 @@ page_flags_test(int section, int node, int zone, int last_cpupid, test(cmp_buf, "%pGp", &flags); } -static void __init -flags(void) +static void +flags(struct kunit *kunittest) { unsigned long flags; char *cmp_buffer; gfp_t gfp; - cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); - if (!cmp_buffer) - return; + cmp_buffer = kunit_kmalloc(kunittest, BUF_SIZE, GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(kunittest, cmp_buffer); flags = 0; - page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); + page_flags_test(kunittest, 0, 0, 0, 0, 0, flags, "", cmp_buffer); flags = 1UL << NR_PAGEFLAGS; - page_flags_test(0, 0, 0, 0, 0, flags, "", cmp_buffer); + page_flags_test(kunittest, 0, 0, 0, 0, 0, flags, "", cmp_buffer); flags |= 1UL << PG_uptodate | 1UL << PG_dirty | 1UL << PG_lru | 1UL << PG_active | 1UL << PG_swapbacked; - page_flags_test(1, 1, 1, 0x1fffff, 1, flags, + page_flags_test(kunittest, 1, 1, 1, 0x1fffff, 1, flags, "uptodate|dirty|lru|active|swapbacked", cmp_buffer); @@ -745,11 +673,9 @@ flags(void) (unsigned long) gfp); gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); - - kfree(cmp_buffer); } -static void __init fwnode_pointer(void) +static void fwnode_pointer(struct kunit *kunittest) { const struct software_node first = { .name = "first" }; const struct software_node second = { .name = "second", .parent = &first }; @@ -763,8 +689,7 @@ static void __init fwnode_pointer(void) rval = software_node_register_node_group(group); if (rval) { - pr_warn("cannot register softnodes; rval %d\n", rval); - return; + kunit_skip(kunittest, "cannot register softnodes; rval %d\n", rval); } test(full_name_second, "%pfw", software_node_fwnode(&second)); @@ -776,7 +701,7 @@ static void __init fwnode_pointer(void) software_node_unregister_node_group(group); } -static void __init fourcc_pointer(void) +static void fourcc_pointer(struct kunit *kunittest) { struct { u32 code; @@ -793,14 +718,14 @@ static void __init fourcc_pointer(void) test(try[i].str, "%p4cc", &try[i].code); } -static void __init -errptr(void) +static void +errptr(struct kunit *kunittest) { test("-1234", "%pe", ERR_PTR(-1234)); /* Check that %pe with a non-ERR_PTR gets treated as ordinary %p. 
*/ BUILD_BUG_ON(IS_ERR(PTR)); - test_hashed("%pe", PTR); + test_hashed(kunittest, "%pe", PTR); #ifdef CONFIG_SYMBOLIC_ERRNAME test("(-ENOTSOCK)", "(%pe)", ERR_PTR(-ENOTSOCK)); @@ -813,51 +738,66 @@ errptr(void) #endif } -static void __init -test_pointer(void) -{ - plain(); - null_pointer(); - error_pointer(); - invalid_pointer(); - symbol_ptr(); - kernel_ptr(); - struct_resource(); - struct_range(); - addr(); - escaped_str(); - hex_string(); - mac(); - ip(); - uuid(); - dentry(); - struct_va_format(); - time_and_date(); - struct_clk(); - bitmap(); - netdev_features(); - flags(); - errptr(); - fwnode_pointer(); - fourcc_pointer(); -} - -static void __init selftest(void) +static int printf_suite_init(struct kunit_suite *suite) { + total_tests = 0; + alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL); if (!alloced_buffer) - return; + return -ENOMEM; test_buffer = alloced_buffer + PAD_SIZE; - test_basic(); - test_number(); - test_string(); - test_pointer(); + return 0; +} +static void printf_suite_exit(struct kunit_suite *suite) +{ kfree(alloced_buffer); -} -KSTM_MODULE_LOADERS(test_printf); + kunit_info(suite, "ran %u tests\n", total_tests); +} + +static struct kunit_case printf_test_cases[] = { + KUNIT_CASE(test_basic), + KUNIT_CASE(test_number), + KUNIT_CASE(test_string), + KUNIT_CASE(hash_pointer), + KUNIT_CASE(null_pointer), + KUNIT_CASE(error_pointer), + KUNIT_CASE(invalid_pointer), + KUNIT_CASE(symbol_ptr), + KUNIT_CASE(kernel_ptr), + KUNIT_CASE(struct_resource), + KUNIT_CASE(struct_range), + KUNIT_CASE(addr), + KUNIT_CASE(escaped_str), + KUNIT_CASE(hex_string), + KUNIT_CASE(mac), + KUNIT_CASE(ip4), + KUNIT_CASE(ip6), + KUNIT_CASE(uuid), + KUNIT_CASE(dentry), + KUNIT_CASE(struct_va_format), + KUNIT_CASE(time_and_date), + KUNIT_CASE(struct_clk), + KUNIT_CASE(bitmap), + KUNIT_CASE(netdev_features), + KUNIT_CASE(flags), + KUNIT_CASE(errptr), + KUNIT_CASE(fwnode_pointer), + KUNIT_CASE(fourcc_pointer), + {} +}; + +static struct kunit_suite printf_test_suite = { + .name = "printf", + .suite_init = printf_suite_init, + .suite_exit = printf_suite_exit, + .test_cases = printf_test_cases, +}; + +kunit_test_suite(printf_test_suite); + MODULE_AUTHOR("Rasmus Villemoes <linux@rasmusvillemoes.dk>"); MODULE_DESCRIPTION("Test cases for printf facility"); MODULE_LICENSE("GPL"); diff --git a/lib/test_scanf.c b/lib/tests/scanf_kunit.c index 44f8508c9d88..e9a36ed80575 100644 --- a/lib/test_scanf.c +++ b/lib/tests/scanf_kunit.c @@ -3,152 +3,138 @@ * Test cases for sscanf facility. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - +#include <kunit/test.h> #include <linux/bitops.h> -#include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/overflow.h> -#include <linux/printk.h> #include <linux/prandom.h> #include <linux/slab.h> #include <linux/string.h> -#include "../tools/testing/selftests/kselftest_module.h" - #define BUF_SIZE 1024 -KSTM_MODULE_GLOBALS(); -static char *test_buffer __initdata; -static char *fmt_buffer __initdata; -static struct rnd_state rnd_state __initdata; +static char *test_buffer; +static char *fmt_buffer; +static struct rnd_state rnd_state; -typedef int (*check_fn)(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap); +typedef void (*check_fn)(struct kunit *test, const char *file, const int line, + const void *check_data, const char *string, const char *fmt, int n_args, + va_list ap); -static void __scanf(4, 6) __init -_test(check_fn fn, const void *check_data, const char *string, const char *fmt, - int n_args, ...) +static void __scanf(7, 9) +_test(struct kunit *test, const char *file, const int line, check_fn fn, const void *check_data, + const char *string, const char *fmt, int n_args, ...) { va_list ap, ap_copy; int ret; - total_tests++; - va_start(ap, n_args); va_copy(ap_copy, ap); ret = vsscanf(string, fmt, ap_copy); va_end(ap_copy); if (ret != n_args) { - pr_warn("vsscanf(\"%s\", \"%s\", ...) returned %d expected %d\n", - string, fmt, ret, n_args); - goto fail; + KUNIT_FAIL(test, "%s:%d: vsscanf(\"%s\", \"%s\", ...) returned %d expected %d", + file, line, string, fmt, ret, n_args); + } else { + (*fn)(test, file, line, check_data, string, fmt, n_args, ap); } - ret = (*fn)(check_data, string, fmt, n_args, ap); - if (ret) - goto fail; - - va_end(ap); - - return; - -fail: - failed_tests++; va_end(ap); } #define _check_numbers_template(arg_fmt, expect, str, fmt, n_args, ap) \ do { \ - pr_debug("\"%s\", \"%s\" ->\n", str, fmt); \ for (; n_args > 0; n_args--, expect++) { \ typeof(*expect) got = *va_arg(ap, typeof(expect)); \ - pr_debug("\t" arg_fmt "\n", got); \ if (got != *expect) { \ - pr_warn("vsscanf(\"%s\", \"%s\", ...) expected " arg_fmt " got " arg_fmt "\n", \ - str, fmt, *expect, got); \ - return 1; \ + KUNIT_FAIL(test, \ + "%s:%d: vsscanf(\"%s\", \"%s\", ...) 
expected " arg_fmt " got " arg_fmt, \ + file, line, str, fmt, *expect, got); \ + return; \ } \ } \ - return 0; \ } while (0) -static int __init check_ull(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_ull(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const unsigned long long *pval = check_data; _check_numbers_template("%llu", pval, string, fmt, n_args, ap); } -static int __init check_ll(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_ll(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const long long *pval = check_data; _check_numbers_template("%lld", pval, string, fmt, n_args, ap); } -static int __init check_ulong(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_ulong(struct kunit *test, const char *file, const int line, + const void *check_data, const char *string, const char *fmt, int n_args, + va_list ap) { const unsigned long *pval = check_data; _check_numbers_template("%lu", pval, string, fmt, n_args, ap); } -static int __init check_long(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_long(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const long *pval = check_data; _check_numbers_template("%ld", pval, string, fmt, n_args, ap); } -static int __init check_uint(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_uint(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const unsigned int *pval = check_data; _check_numbers_template("%u", pval, string, fmt, n_args, ap); } -static int __init check_int(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_int(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const int *pval = check_data; _check_numbers_template("%d", pval, string, fmt, n_args, ap); } -static int __init check_ushort(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_ushort(struct kunit *test, const char *file, const int line, + const void *check_data, const char *string, const char *fmt, int n_args, + va_list ap) { const unsigned short *pval = check_data; _check_numbers_template("%hu", pval, string, fmt, n_args, ap); } -static int __init check_short(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_short(struct kunit *test, const char *file, const int line, + const void *check_data, const char *string, const char *fmt, int n_args, + va_list ap) { const short *pval = check_data; _check_numbers_template("%hd", pval, string, fmt, n_args, ap); } -static int __init check_uchar(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_uchar(struct kunit *test, const char *file, const int line, + const void *check_data, const char *string, const char *fmt, int n_args, + va_list ap) { const unsigned char *pval = check_data; 
_check_numbers_template("%hhu", pval, string, fmt, n_args, ap); } -static int __init check_char(const void *check_data, const char *string, - const char *fmt, int n_args, va_list ap) +static void check_char(struct kunit *test, const char *file, const int line, const void *check_data, + const char *string, const char *fmt, int n_args, va_list ap) { const signed char *pval = check_data; @@ -156,7 +142,7 @@ static int __init check_char(const void *check_data, const char *string, } /* Selection of interesting numbers to test, copied from test-kstrtox.c */ -static const unsigned long long numbers[] __initconst = { +static const unsigned long long numbers[] = { 0x0ULL, 0x1ULL, 0x7fULL, @@ -196,7 +182,7 @@ do { \ T result = ~expect_val; /* should be overwritten */ \ \ snprintf(test_buffer, BUF_SIZE, gen_fmt, expect_val); \ - _test(fn, &expect_val, test_buffer, "%" scan_fmt, 1, &result); \ + _test(test, __FILE__, __LINE__, fn, &expect_val, test_buffer, "%" scan_fmt, 1, &result);\ } while (0) #define simple_numbers_loop(T, gen_fmt, scan_fmt, fn) \ @@ -214,7 +200,7 @@ do { \ } \ } while (0) -static void __init numbers_simple(void) +static void numbers_simple(struct kunit *test) { simple_numbers_loop(unsigned long long, "%llu", "llu", check_ull); simple_numbers_loop(long long, "%lld", "lld", check_ll); @@ -267,14 +253,14 @@ static void __init numbers_simple(void) * the raw prandom*() functions (Not mathematically rigorous!!). * Variabilty of length and value is more important than perfect randomness. */ -static u32 __init next_test_random(u32 max_bits) +static u32 next_test_random(u32 max_bits) { u32 n_bits = hweight32(prandom_u32_state(&rnd_state)) % (max_bits + 1); return prandom_u32_state(&rnd_state) & GENMASK(n_bits, 0); } -static unsigned long long __init next_test_random_ull(void) +static unsigned long long next_test_random_ull(void) { u32 rand1 = prandom_u32_state(&rnd_state); u32 n_bits = (hweight32(rand1) * 3) % 64; @@ -311,7 +297,7 @@ do { \ * updating buf_pos and returning the number of characters appended. * On error buf_pos is not changed and return value is 0. */ -static int __init __printf(4, 5) +static int __printf(4, 5) append_fmt(char *buf, int *buf_pos, int buf_len, const char *val_fmt, ...) { va_list ap; @@ -333,7 +319,7 @@ append_fmt(char *buf, int *buf_pos, int buf_len, const char *val_fmt, ...) * Convenience function to append the field delimiter string * to both the value string and format string buffers. 
*/ -static void __init append_delim(char *str_buf, int *str_buf_pos, int str_buf_len, +static void append_delim(char *str_buf, int *str_buf_pos, int str_buf_len, char *fmt_buf, int *fmt_buf_pos, int fmt_buf_len, const char *delim_str) { @@ -344,7 +330,7 @@ static void __init append_delim(char *str_buf, int *str_buf_pos, int str_buf_len #define test_array_8(fn, check_data, string, fmt, arr) \ do { \ BUILD_BUG_ON(ARRAY_SIZE(arr) != 8); \ - _test(fn, check_data, string, fmt, 8, \ + _test(test, __FILE__, __LINE__, fn, check_data, string, fmt, 8, \ &(arr)[0], &(arr)[1], &(arr)[2], &(arr)[3], \ &(arr)[4], &(arr)[5], &(arr)[6], &(arr)[7]); \ } while (0) @@ -398,7 +384,7 @@ do { \ test_array_8(fn, expect, test_buffer, fmt_buffer, result); \ } while (0) -static void __init numbers_list_ll(const char *delim) +static void numbers_list_ll(struct kunit *test, const char *delim) { numbers_list_8(unsigned long long, "%llu", delim, "llu", check_ull); numbers_list_8(long long, "%lld", delim, "lld", check_ll); @@ -408,7 +394,7 @@ static void __init numbers_list_ll(const char *delim) numbers_list_8(long long, "0x%llx", delim, "lli", check_ll); } -static void __init numbers_list_l(const char *delim) +static void numbers_list_l(struct kunit *test, const char *delim) { numbers_list_8(unsigned long, "%lu", delim, "lu", check_ulong); numbers_list_8(long, "%ld", delim, "ld", check_long); @@ -418,7 +404,7 @@ static void __init numbers_list_l(const char *delim) numbers_list_8(long, "0x%lx", delim, "li", check_long); } -static void __init numbers_list_d(const char *delim) +static void numbers_list_d(struct kunit *test, const char *delim) { numbers_list_8(unsigned int, "%u", delim, "u", check_uint); numbers_list_8(int, "%d", delim, "d", check_int); @@ -428,7 +414,7 @@ static void __init numbers_list_d(const char *delim) numbers_list_8(int, "0x%x", delim, "i", check_int); } -static void __init numbers_list_h(const char *delim) +static void numbers_list_h(struct kunit *test, const char *delim) { numbers_list_8(unsigned short, "%hu", delim, "hu", check_ushort); numbers_list_8(short, "%hd", delim, "hd", check_short); @@ -438,7 +424,7 @@ static void __init numbers_list_h(const char *delim) numbers_list_8(short, "0x%hx", delim, "hi", check_short); } -static void __init numbers_list_hh(const char *delim) +static void numbers_list_hh(struct kunit *test, const char *delim) { numbers_list_8(unsigned char, "%hhu", delim, "hhu", check_uchar); numbers_list_8(signed char, "%hhd", delim, "hhd", check_char); @@ -448,16 +434,19 @@ static void __init numbers_list_hh(const char *delim) numbers_list_8(signed char, "0x%hhx", delim, "hhi", check_char); } -static void __init numbers_list(const char *delim) +static void numbers_list(struct kunit *test) { - numbers_list_ll(delim); - numbers_list_l(delim); - numbers_list_d(delim); - numbers_list_h(delim); - numbers_list_hh(delim); + const char * const *param = test->param_value; + const char *delim = *param; + + numbers_list_ll(test, delim); + numbers_list_l(test, delim); + numbers_list_d(test, delim); + numbers_list_h(test, delim); + numbers_list_hh(test, delim); } -static void __init numbers_list_field_width_ll(const char *delim) +static void numbers_list_field_width_ll(struct kunit *test, const char *delim) { numbers_list_fix_width(unsigned long long, "%llu", delim, 20, "llu", check_ull); numbers_list_fix_width(long long, "%lld", delim, 20, "lld", check_ll); @@ -467,7 +456,7 @@ static void __init numbers_list_field_width_ll(const char *delim) numbers_list_fix_width(long long, "0x%llx", 
delim, 18, "lli", check_ll); } -static void __init numbers_list_field_width_l(const char *delim) +static void numbers_list_field_width_l(struct kunit *test, const char *delim) { #if BITS_PER_LONG == 64 numbers_list_fix_width(unsigned long, "%lu", delim, 20, "lu", check_ulong); @@ -486,7 +475,7 @@ static void __init numbers_list_field_width_l(const char *delim) #endif } -static void __init numbers_list_field_width_d(const char *delim) +static void numbers_list_field_width_d(struct kunit *test, const char *delim) { numbers_list_fix_width(unsigned int, "%u", delim, 10, "u", check_uint); numbers_list_fix_width(int, "%d", delim, 11, "d", check_int); @@ -496,7 +485,7 @@ static void __init numbers_list_field_width_d(const char *delim) numbers_list_fix_width(int, "0x%x", delim, 10, "i", check_int); } -static void __init numbers_list_field_width_h(const char *delim) +static void numbers_list_field_width_h(struct kunit *test, const char *delim) { numbers_list_fix_width(unsigned short, "%hu", delim, 5, "hu", check_ushort); numbers_list_fix_width(short, "%hd", delim, 6, "hd", check_short); @@ -506,7 +495,7 @@ static void __init numbers_list_field_width_h(const char *delim) numbers_list_fix_width(short, "0x%hx", delim, 6, "hi", check_short); } -static void __init numbers_list_field_width_hh(const char *delim) +static void numbers_list_field_width_hh(struct kunit *test, const char *delim) { numbers_list_fix_width(unsigned char, "%hhu", delim, 3, "hhu", check_uchar); numbers_list_fix_width(signed char, "%hhd", delim, 4, "hhd", check_char); @@ -520,16 +509,19 @@ static void __init numbers_list_field_width_hh(const char *delim) * List of numbers separated by delim. Each field width specifier is the * maximum possible digits for the given type and base. */ -static void __init numbers_list_field_width_typemax(const char *delim) +static void numbers_list_field_width_typemax(struct kunit *test) { - numbers_list_field_width_ll(delim); - numbers_list_field_width_l(delim); - numbers_list_field_width_d(delim); - numbers_list_field_width_h(delim); - numbers_list_field_width_hh(delim); + const char * const *param = test->param_value; + const char *delim = *param; + + numbers_list_field_width_ll(test, delim); + numbers_list_field_width_l(test, delim); + numbers_list_field_width_d(test, delim); + numbers_list_field_width_h(test, delim); + numbers_list_field_width_hh(test, delim); } -static void __init numbers_list_field_width_val_ll(const char *delim) +static void numbers_list_field_width_val_ll(struct kunit *test, const char *delim) { numbers_list_val_width(unsigned long long, "%llu", delim, "llu", check_ull); numbers_list_val_width(long long, "%lld", delim, "lld", check_ll); @@ -539,7 +531,7 @@ static void __init numbers_list_field_width_val_ll(const char *delim) numbers_list_val_width(long long, "0x%llx", delim, "lli", check_ll); } -static void __init numbers_list_field_width_val_l(const char *delim) +static void numbers_list_field_width_val_l(struct kunit *test, const char *delim) { numbers_list_val_width(unsigned long, "%lu", delim, "lu", check_ulong); numbers_list_val_width(long, "%ld", delim, "ld", check_long); @@ -549,7 +541,7 @@ static void __init numbers_list_field_width_val_l(const char *delim) numbers_list_val_width(long, "0x%lx", delim, "li", check_long); } -static void __init numbers_list_field_width_val_d(const char *delim) +static void numbers_list_field_width_val_d(struct kunit *test, const char *delim) { numbers_list_val_width(unsigned int, "%u", delim, "u", check_uint); numbers_list_val_width(int, 
"%d", delim, "d", check_int); @@ -559,7 +551,7 @@ static void __init numbers_list_field_width_val_d(const char *delim) numbers_list_val_width(int, "0x%x", delim, "i", check_int); } -static void __init numbers_list_field_width_val_h(const char *delim) +static void numbers_list_field_width_val_h(struct kunit *test, const char *delim) { numbers_list_val_width(unsigned short, "%hu", delim, "hu", check_ushort); numbers_list_val_width(short, "%hd", delim, "hd", check_short); @@ -569,7 +561,7 @@ static void __init numbers_list_field_width_val_h(const char *delim) numbers_list_val_width(short, "0x%hx", delim, "hi", check_short); } -static void __init numbers_list_field_width_val_hh(const char *delim) +static void numbers_list_field_width_val_hh(struct kunit *test, const char *delim) { numbers_list_val_width(unsigned char, "%hhu", delim, "hhu", check_uchar); numbers_list_val_width(signed char, "%hhd", delim, "hhd", check_char); @@ -583,13 +575,16 @@ static void __init numbers_list_field_width_val_hh(const char *delim) * List of numbers separated by delim. Each field width specifier is the * exact length of the corresponding value digits in the string being scanned. */ -static void __init numbers_list_field_width_val_width(const char *delim) +static void numbers_list_field_width_val_width(struct kunit *test) { - numbers_list_field_width_val_ll(delim); - numbers_list_field_width_val_l(delim); - numbers_list_field_width_val_d(delim); - numbers_list_field_width_val_h(delim); - numbers_list_field_width_val_hh(delim); + const char * const *param = test->param_value; + const char *delim = *param; + + numbers_list_field_width_val_ll(test, delim); + numbers_list_field_width_val_l(test, delim); + numbers_list_field_width_val_d(test, delim); + numbers_list_field_width_val_h(test, delim); + numbers_list_field_width_val_hh(test, delim); } /* @@ -598,9 +593,14 @@ static void __init numbers_list_field_width_val_width(const char *delim) * of digits. For example the hex values c0,3,bf01,303 would have a * string representation of "c03bf01303" and extracted with "%2x%1x%4x%3x". */ -static void __init numbers_slice(void) +static void numbers_slice(struct kunit *test) { - numbers_list_field_width_val_width(""); + const char *delim = ""; + + KUNIT_ASSERT_PTR_EQ(test, test->param_value, NULL); + test->param_value = &delim; + + numbers_list_field_width_val_width(test); } #define test_number_prefix(T, str, scan_fmt, expect0, expect1, n_args, fn) \ @@ -608,14 +608,14 @@ do { \ const T expect[2] = { expect0, expect1 }; \ T result[2] = { (T)~expect[0], (T)~expect[1] }; \ \ - _test(fn, &expect, str, scan_fmt, n_args, &result[0], &result[1]); \ + _test(test, __FILE__, __LINE__, fn, &expect, str, scan_fmt, n_args, &result[0], &result[1]);\ } while (0) /* * Number prefix is >= field width. * Expected behaviour is derived from testing userland sscanf. 
*/ -static void __init numbers_prefix_overflow(void) +static void numbers_prefix_overflow(struct kunit *test) { /* * Negative decimal with a field of width 1, should quit scanning @@ -684,25 +684,17 @@ do { \ T got; \ char *endp; \ int len; \ - bool fail = false; \ \ - total_tests++; \ len = snprintf(test_buffer, BUF_SIZE, gen_fmt, expect); \ got = (fn)(test_buffer, &endp, base); \ - pr_debug(#fn "(\"%s\", %d) -> " gen_fmt "\n", test_buffer, base, got); \ if (got != (expect)) { \ - fail = true; \ - pr_warn(#fn "(\"%s\", %d): got " gen_fmt " expected " gen_fmt "\n", \ - test_buffer, base, got, expect); \ + KUNIT_FAIL(test, #fn "(\"%s\", %d): got " gen_fmt " expected " gen_fmt, \ + test_buffer, base, got, expect); \ } else if (endp != test_buffer + len) { \ - fail = true; \ - pr_warn(#fn "(\"%s\", %d) startp=0x%px got endp=0x%px expected 0x%px\n", \ - test_buffer, base, test_buffer, \ - test_buffer + len, endp); \ + KUNIT_FAIL(test, #fn "(\"%s\", %d) startp=0x%px got endp=0x%px expected 0x%px", \ + test_buffer, base, test_buffer, \ + test_buffer + len, endp); \ } \ - \ - if (fail) \ - failed_tests++; \ } while (0) #define test_simple_strtoxx(T, fn, gen_fmt, base) \ @@ -718,7 +710,7 @@ do { \ } \ } while (0) -static void __init test_simple_strtoull(void) +static void test_simple_strtoull(struct kunit *test) { test_simple_strtoxx(unsigned long long, simple_strtoull, "%llu", 10); test_simple_strtoxx(unsigned long long, simple_strtoull, "%llu", 0); @@ -727,7 +719,7 @@ static void __init test_simple_strtoull(void) test_simple_strtoxx(unsigned long long, simple_strtoull, "0x%llx", 0); } -static void __init test_simple_strtoll(void) +static void test_simple_strtoll(struct kunit *test) { test_simple_strtoxx(long long, simple_strtoll, "%lld", 10); test_simple_strtoxx(long long, simple_strtoll, "%lld", 0); @@ -736,7 +728,7 @@ static void __init test_simple_strtoll(void) test_simple_strtoxx(long long, simple_strtoll, "0x%llx", 0); } -static void __init test_simple_strtoul(void) +static void test_simple_strtoul(struct kunit *test) { test_simple_strtoxx(unsigned long, simple_strtoul, "%lu", 10); test_simple_strtoxx(unsigned long, simple_strtoul, "%lu", 0); @@ -745,7 +737,7 @@ static void __init test_simple_strtoul(void) test_simple_strtoxx(unsigned long, simple_strtoul, "0x%lx", 0); } -static void __init test_simple_strtol(void) +static void test_simple_strtol(struct kunit *test) { test_simple_strtoxx(long, simple_strtol, "%ld", 10); test_simple_strtoxx(long, simple_strtol, "%ld", 0); @@ -755,60 +747,69 @@ static void __init test_simple_strtol(void) } /* Selection of common delimiters/separators between numbers in a string. */ -static const char * const number_delimiters[] __initconst = { +static const char * const number_delimiters[] = { " ", ":", ",", "-", "/", }; -static void __init test_numbers(void) +static void number_delimiter_param_desc(const char * const *param, + char *desc) { - int i; + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "\"%s\"", *param); +} - /* String containing only one number. */ - numbers_simple(); +KUNIT_ARRAY_PARAM(number_delimiters, number_delimiters, number_delimiter_param_desc); +static struct kunit_case scanf_test_cases[] = { + KUNIT_CASE(numbers_simple), /* String with multiple numbers separated by delimiter. */ - for (i = 0; i < ARRAY_SIZE(number_delimiters); i++) { - numbers_list(number_delimiters[i]); - - /* Field width may be longer than actual field digits. 
*/ - numbers_list_field_width_typemax(number_delimiters[i]); - - /* Each field width exactly length of actual field digits. */ - numbers_list_field_width_val_width(number_delimiters[i]); - } - + KUNIT_CASE_PARAM(numbers_list, number_delimiters_gen_params), + /* Field width may be longer than actual field digits. */ + KUNIT_CASE_PARAM(numbers_list_field_width_typemax, number_delimiters_gen_params), + /* Each field width exactly length of actual field digits. */ + KUNIT_CASE_PARAM(numbers_list_field_width_val_width, number_delimiters_gen_params), /* Slice continuous sequence of digits using field widths. */ - numbers_slice(); - - numbers_prefix_overflow(); -} + KUNIT_CASE(numbers_slice), + KUNIT_CASE(numbers_prefix_overflow), + + KUNIT_CASE(test_simple_strtoull), + KUNIT_CASE(test_simple_strtoll), + KUNIT_CASE(test_simple_strtoul), + KUNIT_CASE(test_simple_strtol), + {} +}; -static void __init selftest(void) +static int scanf_suite_init(struct kunit_suite *suite) { test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); if (!test_buffer) - return; + return -ENOMEM; fmt_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); if (!fmt_buffer) { kfree(test_buffer); - return; + return -ENOMEM; } prandom_seed_state(&rnd_state, 3141592653589793238ULL); - test_numbers(); - - test_simple_strtoull(); - test_simple_strtoll(); - test_simple_strtoul(); - test_simple_strtol(); + return 0; +} +static void scanf_suite_exit(struct kunit_suite *suite) +{ kfree(fmt_buffer); kfree(test_buffer); } -KSTM_MODULE_LOADERS(test_scanf); +static struct kunit_suite scanf_test_suite = { + .name = "scanf", + .suite_init = scanf_suite_init, + .suite_exit = scanf_suite_exit, + .test_cases = scanf_test_cases, +}; + +kunit_test_suite(scanf_test_suite); + MODULE_AUTHOR("Richard Fitzgerald <rf@opensource.cirrus.com>"); MODULE_DESCRIPTION("Test cases for sscanf facility"); MODULE_LICENSE("GPL v2"); diff --git a/lib/siphash_kunit.c b/lib/tests/siphash_kunit.c index 26bd4e8dc03e..26bd4e8dc03e 100644 --- a/lib/siphash_kunit.c +++ b/lib/tests/siphash_kunit.c diff --git a/lib/slub_kunit.c b/lib/tests/slub_kunit.c index f11691315c2f..d47c472b0520 100644 --- a/lib/slub_kunit.c +++ b/lib/tests/slub_kunit.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/rcupdate.h> +#include <linux/delay.h> #include "../mm/slab.h" static struct kunit_resource resource; @@ -181,6 +182,63 @@ static void test_kfree_rcu(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, slab_errors); } +struct cache_destroy_work { + struct work_struct work; + struct kmem_cache *s; +}; + +static void cache_destroy_workfn(struct work_struct *w) +{ + struct cache_destroy_work *cdw; + + cdw = container_of(w, struct cache_destroy_work, work); + kmem_cache_destroy(cdw->s); +} + +#define KMEM_CACHE_DESTROY_NR 10 + +static void test_kfree_rcu_wq_destroy(struct kunit *test) +{ + struct test_kfree_rcu_struct *p; + struct cache_destroy_work cdw; + struct workqueue_struct *wq; + struct kmem_cache *s; + unsigned int delay; + int i; + + if (IS_BUILTIN(CONFIG_SLUB_KUNIT_TEST)) + kunit_skip(test, "can't do kfree_rcu() when test is built-in"); + + INIT_WORK_ONSTACK(&cdw.work, cache_destroy_workfn); + wq = alloc_workqueue("test_kfree_rcu_destroy_wq", + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + + if (!wq) + kunit_skip(test, "failed to alloc wq"); + + for (i = 0; i < KMEM_CACHE_DESTROY_NR; i++) { + s = test_kmem_cache_create("TestSlub_kfree_rcu_wq_destroy", + sizeof(struct test_kfree_rcu_struct), + SLAB_NO_MERGE); + + if (!s) + kunit_skip(test, "failed to create cache"); + + delay = 
get_random_u8(); + p = kmem_cache_alloc(s, GFP_KERNEL); + kfree_rcu(p, rcu); + + cdw.s = s; + + msleep(delay); + queue_work(wq, &cdw.work); + flush_work(&cdw.work); + } + + destroy_workqueue(wq); + KUNIT_EXPECT_EQ(test, 0, slab_errors); +} + static void test_leak_destroy(struct kunit *test) { struct kmem_cache *s = test_kmem_cache_create("TestSlub_leak_destroy", @@ -254,6 +312,7 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_clobber_redzone_free), KUNIT_CASE(test_kmalloc_redzone_access), KUNIT_CASE(test_kfree_rcu), + KUNIT_CASE(test_kfree_rcu_wq_destroy), KUNIT_CASE(test_leak_destroy), KUNIT_CASE(test_krealloc_redzone_zeroing), {} diff --git a/lib/stackinit_kunit.c b/lib/tests/stackinit_kunit.c index 135322592faf..63aa78e6f5c1 100644 --- a/lib/stackinit_kunit.c +++ b/lib/tests/stackinit_kunit.c @@ -185,6 +185,15 @@ static bool stackinit_range_contains(char *haystack_start, size_t haystack_size, INIT_STRUCT_assigned_copy(var_type) /* + * The "did we actually fill the stack?" check value needs + * to be neither 0 nor any of the "pattern" bytes. The + * pattern bytes are compiler, architecture, and type based, + * so we have to pick a value that never appears for those + * combinations. Use 0x99 which is not 0xFF, 0xFE, nor 0xAA. + */ +#define FILL_BYTE 0x99 + +/* * @name: unique string name for the test * @var_type: type to be tested for zeroing initialization * @which: is this a SCALAR, STRING, or STRUCT type? @@ -206,12 +215,12 @@ static noinline void test_ ## name (struct kunit *test) \ ZERO_CLONE_ ## which(zero); \ /* Clear entire check buffer for 0xFF overlap test. */ \ memset(check_buf, 0x00, sizeof(check_buf)); \ - /* Fill stack with 0xFF. */ \ + /* Fill stack with FILL_BYTE. */ \ ignored = leaf_ ##name((unsigned long)&ignored, 1, \ FETCH_ARG_ ## which(zero)); \ - /* Verify all bytes overwritten with 0xFF. */ \ + /* Verify all bytes overwritten with FILL_BYTE. */ \ for (sum = 0, i = 0; i < target_size; i++) \ - sum += (check_buf[i] != 0xFF); \ + sum += (check_buf[i] != FILL_BYTE); \ /* Clear entire check buffer for later bit tests. */ \ memset(check_buf, 0x00, sizeof(check_buf)); \ /* Extract stack-defined variable contents. */ \ @@ -222,7 +231,8 @@ static noinline void test_ ## name (struct kunit *test) \ * possible between the two leaf function calls. \ */ \ KUNIT_ASSERT_EQ_MSG(test, sum, 0, \ - "leaf fill was not 0xFF!?\n"); \ + "leaf fill was not 0x%02X!?\n", \ + FILL_BYTE); \ \ /* Validate that compiler lined up fill and target. */ \ KUNIT_ASSERT_TRUE_MSG(test, \ @@ -234,9 +244,9 @@ static noinline void test_ ## name (struct kunit *test) \ (int)((ssize_t)(uintptr_t)fill_start - \ (ssize_t)(uintptr_t)target_start)); \ \ - /* Look for any bytes still 0xFF in check region. */ \ + /* Validate check region has no FILL_BYTE bytes. */ \ for (sum = 0, i = 0; i < target_size; i++) \ - sum += (check_buf[i] == 0xFF); \ + sum += (check_buf[i] == FILL_BYTE); \ \ if (sum != 0 && xfail) \ kunit_skip(test, \ @@ -271,12 +281,12 @@ static noinline int leaf_ ## name(unsigned long sp, bool fill, \ * stack frame of SOME kind... \ */ \ memset(buf, (char)(sp & 0xff), sizeof(buf)); \ - /* Fill variable with 0xFF. */ \ + /* Fill variable with FILL_BYTE. 
*/ \ if (fill) { \ fill_start = &var; \ fill_size = sizeof(var); \ memset(fill_start, \ - (char)((sp & 0xff) | forced_mask), \ + FILL_BYTE & forced_mask, \ fill_size); \ } \ \ @@ -469,7 +479,7 @@ static int noinline __leaf_switch_none(int path, bool fill) fill_start = &var; fill_size = sizeof(var); - memset(fill_start, forced_mask | 0x55, fill_size); + memset(fill_start, (forced_mask | 0x55) & FILL_BYTE, fill_size); } memcpy(check_buf, target_start, target_size); break; @@ -480,7 +490,7 @@ static int noinline __leaf_switch_none(int path, bool fill) fill_start = &var; fill_size = sizeof(var); - memset(fill_start, forced_mask | 0xaa, fill_size); + memset(fill_start, (forced_mask | 0xaa) & FILL_BYTE, fill_size); } memcpy(check_buf, target_start, target_size); break; diff --git a/lib/string_helpers_kunit.c b/lib/tests/string_helpers_kunit.c index c853046183d2..c853046183d2 100644 --- a/lib/string_helpers_kunit.c +++ b/lib/tests/string_helpers_kunit.c diff --git a/lib/string_kunit.c b/lib/tests/string_kunit.c index c919e3293da6..0ed7448a26d3 100644 --- a/lib/string_kunit.c +++ b/lib/tests/string_kunit.c @@ -579,8 +579,8 @@ static void string_test_strtomem(struct kunit *test) static void string_test_memtostr(struct kunit *test) { - char nonstring[7] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g' }; - char nonstring_small[3] = { 'a', 'b', 'c' }; + char nonstring[7] __nonstring = { 'a', 'b', 'c', 'd', 'e', 'f', 'g' }; + char nonstring_small[3] __nonstring = { 'a', 'b', 'c' }; char dest[sizeof(nonstring) + 1]; /* Copy in a non-NUL-terminated string into exactly right-sized dest. */ diff --git a/lib/test_bits.c b/lib/tests/test_bits.c index c7b38d91e1f1..c7b38d91e1f1 100644 --- a/lib/test_bits.c +++ b/lib/tests/test_bits.c diff --git a/lib/test_fprobe.c b/lib/tests/test_fprobe.c index cf92111b5c79..cf92111b5c79 100644 --- a/lib/test_fprobe.c +++ b/lib/tests/test_fprobe.c diff --git a/lib/test_hash.c b/lib/tests/test_hash.c index a7af39662a0a..a7af39662a0a 100644 --- a/lib/test_hash.c +++ b/lib/tests/test_hash.c diff --git a/lib/test_kprobes.c b/lib/tests/test_kprobes.c index b7582010125c..b7582010125c 100644 --- a/lib/test_kprobes.c +++ b/lib/tests/test_kprobes.c diff --git a/lib/test_linear_ranges.c b/lib/tests/test_linear_ranges.c index f482be00f1bc..f482be00f1bc 100644 --- a/lib/test_linear_ranges.c +++ b/lib/tests/test_linear_ranges.c diff --git a/lib/test_list_sort.c b/lib/tests/test_list_sort.c index 30879abc8a42..30879abc8a42 100644 --- a/lib/test_list_sort.c +++ b/lib/tests/test_list_sort.c diff --git a/lib/test_sort.c b/lib/tests/test_sort.c index cd4a338d1153..cd4a338d1153 100644 --- a/lib/test_sort.c +++ b/lib/tests/test_sort.c diff --git a/lib/usercopy_kunit.c b/lib/tests/usercopy_kunit.c index 77fa00a13df7..77fa00a13df7 100644 --- a/lib/usercopy_kunit.c +++ b/lib/tests/usercopy_kunit.c diff --git a/lib/util_macros_kunit.c b/lib/tests/util_macros_kunit.c index 94cc9f0de50a..94cc9f0de50a 100644 --- a/lib/util_macros_kunit.c +++ b/lib/tests/util_macros_kunit.c diff --git a/lib/ubsan.c b/lib/ubsan.c index a1c983d148f1..cdc1d31c3821 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -44,7 +44,7 @@ const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) case ubsan_shift_out_of_bounds: return "UBSAN: shift out of bounds"; #endif -#if defined(CONFIG_UBSAN_DIV_ZERO) || defined(CONFIG_UBSAN_SIGNED_WRAP) +#if defined(CONFIG_UBSAN_DIV_ZERO) || defined(CONFIG_UBSAN_INTEGER_WRAP) /* * SanitizerKind::IntegerDivideByZero and * SanitizerKind::SignedIntegerOverflow emit @@ -79,7 +79,7 @@ const char 
*report_ubsan_failure(struct pt_regs *regs, u32 check_type) case ubsan_type_mismatch: return "UBSAN: type mismatch"; #endif -#ifdef CONFIG_UBSAN_SIGNED_WRAP +#ifdef CONFIG_UBSAN_INTEGER_WRAP /* * SanitizerKind::SignedIntegerOverflow emits * SanitizerHandler::AddOverflow, SanitizerHandler::SubOverflow, @@ -303,6 +303,30 @@ void __ubsan_handle_negate_overflow(void *_data, void *old_val) } EXPORT_SYMBOL(__ubsan_handle_negate_overflow); +void __ubsan_handle_implicit_conversion(void *_data, void *from_val, void *to_val) +{ + struct implicit_conversion_data *data = _data; + char from_val_str[VALUE_LENGTH]; + char to_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + val_to_string(from_val_str, sizeof(from_val_str), data->from_type, from_val); + val_to_string(to_val_str, sizeof(to_val_str), data->to_type, to_val); + + ubsan_prologue(&data->location, "implicit-conversion"); + + pr_err("cannot represent %s value %s during %s %s, truncated to %s\n", + data->from_type->type_name, + from_val_str, + type_check_kinds[data->type_check_kind], + data->to_type->type_name, + to_val_str); + + ubsan_epilogue(); +} +EXPORT_SYMBOL(__ubsan_handle_implicit_conversion); void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) { diff --git a/lib/ubsan.h b/lib/ubsan.h index 07e37d4429b4..b37e22374e77 100644 --- a/lib/ubsan.h +++ b/lib/ubsan.h @@ -62,6 +62,13 @@ struct overflow_data { struct type_descriptor *type; }; +struct implicit_conversion_data { + struct source_location location; + struct type_descriptor *from_type; + struct type_descriptor *to_type; + unsigned char type_check_kind; +}; + struct type_mismatch_data { struct source_location location; struct type_descriptor *type; @@ -142,6 +149,7 @@ void ubsan_linkage __ubsan_handle_sub_overflow(void *data, void *lhs, void *rhs) void ubsan_linkage __ubsan_handle_mul_overflow(void *data, void *lhs, void *rhs); void ubsan_linkage __ubsan_handle_negate_overflow(void *_data, void *old_val); void ubsan_linkage __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs); +void ubsan_linkage __ubsan_handle_implicit_conversion(void *_data, void *lhs, void *rhs); void ubsan_linkage __ubsan_handle_type_mismatch(struct type_mismatch_data *data, void *ptr); void ubsan_linkage __ubsan_handle_type_mismatch_v1(void *_data, void *ptr); void ubsan_linkage __ubsan_handle_out_of_bounds(void *_data, void *index); diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig index 82fe827af542..45df764b49ad 100644 --- a/lib/vdso/Kconfig +++ b/lib/vdso/Kconfig @@ -43,3 +43,8 @@ config VDSO_GETRANDOM bool help Selected by architectures that support vDSO getrandom(). + +config GENERIC_VDSO_DATA_STORE + bool + help + Selected by architectures that use the generic vDSO data store. diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index cedbf15f8087..aedd40aaa950 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -1,18 +1,3 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only -GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) - -c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) -c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) - -# This cmd checks that the vdso library does not contain dynamic relocations. -# It has to be called after the linking of the vdso library and requires it -# as a parameter. 
-# -# As a workaround for some GNU ld ports which produce unneeded R_*_NONE -# dynamic relocations, ignore R_*_NONE. -quiet_cmd_vdso_check = VDSOCHK $@ - cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ - then (echo >&2 "$@: dynamic relocations are not supported"; \ - rm -f $@; /bin/false); fi +obj-$(CONFIG_GENERIC_VDSO_DATA_STORE) += datastore.o diff --git a/lib/vdso/Makefile.include b/lib/vdso/Makefile.include new file mode 100644 index 000000000000..cedbf15f8087 --- /dev/null +++ b/lib/vdso/Makefile.include @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) + +c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) +c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) + +# This cmd checks that the vdso library does not contain dynamic relocations. +# It has to be called after the linking of the vdso library and requires it +# as a parameter. +# +# As a workaround for some GNU ld ports which produce unneeded R_*_NONE +# dynamic relocations, ignore R_*_NONE. +quiet_cmd_vdso_check = VDSOCHK $@ + cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ + then (echo >&2 "$@: dynamic relocations are not supported"; \ + rm -f $@; /bin/false); fi diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c new file mode 100644 index 000000000000..c715e217ec65 --- /dev/null +++ b/lib/vdso/datastore.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/linkage.h> +#include <linux/mmap_lock.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/types.h> +#include <linux/vdso_datastore.h> +#include <vdso/datapage.h> + +/* + * The vDSO data page. + */ +#ifdef CONFIG_HAVE_GENERIC_VDSO +static union { + struct vdso_time_data data; + u8 page[PAGE_SIZE]; +} vdso_time_data_store __page_aligned_data; +struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; +static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + +#ifdef CONFIG_VDSO_GETRANDOM +static union { + struct vdso_rng_data data; + u8 page[PAGE_SIZE]; +} vdso_rng_data_store __page_aligned_data; +struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data; +static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE); +#endif /* CONFIG_VDSO_GETRANDOM */ + +#ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA +static union { + struct vdso_arch_data data; + u8 page[VDSO_ARCH_DATA_SIZE]; +} vdso_arch_data_store __page_aligned_data; +struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data; +#endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long addr, pfn; + vm_fault_t err; + + switch (vmf->pgoff) { + case VDSO_TIME_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_VDSO)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. 
+ */ + addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; + pfn = page_to_pfn(timens_page); + } + break; + case VDSO_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + break; + case VDSO_RNG_PAGE_OFFSET: + if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + break; + case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: + if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + return VM_FAULT_SIGBUS; + pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + + vmf->pgoff - VDSO_ARCH_PAGES_START; + break; + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +const struct vm_special_mapping vdso_vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + +struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr) +{ + return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, + VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | VM_PFNMAP, + &vdso_vvar_mapping); +} + +#ifdef CONFIG_TIME_NS +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mmap_read_lock(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma_pages(vma); + } + mmap_read_unlock(mm); + + return 0; +} +#endif diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index 938ca539aaa6..440f8a6203a6 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -12,6 +12,9 @@ #include <uapi/linux/mman.h> #include <uapi/linux/random.h> +/* Bring in default accessors */ +#include <vdso/vsyscall.h> + #undef PAGE_SIZE #undef PAGE_MASK #define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT) @@ -152,7 +155,7 @@ retry_generation: /* * Prevent the syscall from being reordered wrt current_generation. Pairs with the - * smp_store_release(&_vdso_rng_data.generation) in random.c. + * smp_store_release(&vdso_k_rng_data->generation) in random.c. 
*/ smp_rmb(); @@ -256,5 +259,6 @@ fallback_syscall: static __always_inline ssize_t __cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) { - return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len); + return __cvdso_getrandom_data(__arch_get_vdso_u_rng_data(), buffer, len, flags, + opaque_state, opaque_len); } diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index c01eaafd8041..93ef801a97ef 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -5,6 +5,9 @@ #include <vdso/datapage.h> #include <vdso/helpers.h> +/* Bring in default accessors */ +#include <vdso/vsyscall.h> + #ifndef vdso_calc_ns #ifdef VDSO_DELTA_NOMASK @@ -14,12 +17,12 @@ #endif #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT -static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta) +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) { - return delta < vd->max_cycles; + return delta < vc->max_cycles; } #else -static __always_inline bool vdso_delta_ok(const struct vdso_data *vd, u64 delta) +static __always_inline bool vdso_delta_ok(const struct vdso_clock *vc, u64 delta) { return true; } @@ -36,14 +39,14 @@ static __always_inline u64 vdso_shift_ns(u64 ns, u32 shift) * Default implementation which works for all sane clocksources. That * obviously excludes x86/TSC. */ -static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base) +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) { - u64 delta = (cycles - vd->cycle_last) & VDSO_DELTA_MASK(vd); + u64 delta = (cycles - vc->cycle_last) & VDSO_DELTA_MASK(vc); - if (likely(vdso_delta_ok(vd, delta))) - return vdso_shift_ns((delta * vd->mult) + base, vd->shift); + if (likely(vdso_delta_ok(vc, delta))) + return vdso_shift_ns((delta * vc->mult) + base, vc->shift); - return mul_u64_u32_add_u64_shr(delta, vd->mult, base, vd->shift); + return mul_u64_u32_add_u64_shr(delta, vc->mult, base, vc->shift); } #endif /* vdso_calc_ns */ @@ -55,9 +58,9 @@ static inline bool __arch_vdso_hres_capable(void) #endif #ifndef vdso_clocksource_ok -static inline bool vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool vdso_clocksource_ok(const struct vdso_clock *vc) { - return vd->clock_mode != VDSO_CLOCKMODE_NONE; + return vc->clock_mode != VDSO_CLOCKMODE_NONE; } #endif @@ -69,36 +72,45 @@ static inline bool vdso_cycles_ok(u64 cycles) #endif #ifdef CONFIG_TIME_NS -static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) + +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE +static __always_inline +const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) +{ + return (void *)vd + PAGE_SIZE; +} +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ + +static __always_inline +int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; const struct vdso_timestamp *vdso_ts; - const struct vdso_data *vd; u64 cycles, ns; u32 seq; s64 sec; - vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? 
CS_RAW : CS_HRES_COARSE); - vd = __arch_get_timens_vdso_data(vd); if (clk != CLOCK_MONOTONIC_RAW) - vd = &vd[CS_HRES_COARSE]; + vc = &vc[CS_HRES_COARSE]; else - vd = &vd[CS_RAW]; - vdso_ts = &vd->basetime[clk]; + vc = &vc[CS_RAW]; + vdso_ts = &vc->basetime[clk]; do { - seq = vdso_read_begin(vd); + seq = vdso_read_begin(vc); - if (unlikely(!vdso_clocksource_ok(vd))) + if (unlikely(!vdso_clocksource_ok(vc))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode, vd); + cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; - ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec); + ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ sec += offs->sec; @@ -115,22 +127,24 @@ static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_ } #else static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +const struct vdso_time_data *__arch_get_vdso_u_timens_data(const struct vdso_time_data *vd) { return NULL; } -static __always_inline int do_hres_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_hres_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { return -EINVAL; } #endif -static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_hres(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u64 cycles, sec, ns; u32 seq; @@ -142,31 +156,31 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, /* * Open coded function vdso_read_begin() to handle * VDSO_CLOCKMODE_TIMENS. Time namespace enabled tasks have a - * special VVAR page installed which has vd->seq set to 1 and - * vd->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time + * special VVAR page installed which has vc->seq set to 1 and + * vc->clock_mode set to VDSO_CLOCKMODE_TIMENS. For non time * namespace affected tasks this does not affect performance - * because if vd->seq is odd, i.e. a concurrent update is in - * progress the extra check for vd->clock_mode is just a few - * extra instructions while spin waiting for vd->seq to become + * because if vc->seq is odd, i.e. a concurrent update is in + * progress the extra check for vc->clock_mode is just a few + * extra instructions while spin waiting for vc->seq to become * even again. 
*/ - while (unlikely((seq = READ_ONCE(vd->seq)) & 1)) { + while (unlikely((seq = READ_ONCE(vc->seq)) & 1)) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_hres_timens(vd, clk, ts); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_hres_timens(vd, vc, clk, ts); cpu_relax(); } smp_rmb(); - if (unlikely(!vdso_clocksource_ok(vd))) + if (unlikely(!vdso_clocksource_ok(vc))) return -1; - cycles = __arch_get_hw_counter(vd->clock_mode, vd); + cycles = __arch_get_hw_counter(vc->clock_mode, vd); if (unlikely(!vdso_cycles_ok(cycles))) return -1; - ns = vdso_calc_ns(vd, cycles, vdso_ts->nsec); + ns = vdso_calc_ns(vc, cycles, vdso_ts->nsec); sec = vdso_ts->sec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* * Do this outside the loop: a race inside the loop could result @@ -179,21 +193,25 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk, } #ifdef CONFIG_TIME_NS -static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_data *vd = __arch_get_timens_vdso_data(vdns); - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; - const struct timens_offset *offs = &vdns->offset[clk]; + const struct vdso_time_data *vd = __arch_get_vdso_u_timens_data(vdns); + const struct timens_offset *offs = &vcns->offset[clk]; + const struct vdso_clock *vc = vd->clock_data; + const struct vdso_timestamp *vdso_ts; u64 nsec; s64 sec; s32 seq; + vdso_ts = &vc->basetime[clk]; + do { - seq = vdso_read_begin(vd); + seq = vdso_read_begin(vc); sec = vdso_ts->sec; nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); /* Add the namespace offset */ sec += offs->sec; @@ -208,17 +226,19 @@ static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clocki return 0; } #else -static __always_inline int do_coarse_timens(const struct vdso_data *vdns, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse_timens(const struct vdso_time_data *vdns, const struct vdso_clock *vcns, + clockid_t clk, struct __kernel_timespec *ts) { return -1; } #endif -static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, - struct __kernel_timespec *ts) +static __always_inline +int do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc, + clockid_t clk, struct __kernel_timespec *ts) { - const struct vdso_timestamp *vdso_ts = &vd->basetime[clk]; + const struct vdso_timestamp *vdso_ts = &vc->basetime[clk]; u32 seq; do { @@ -226,25 +246,26 @@ static __always_inline int do_coarse(const struct vdso_data *vd, clockid_t clk, * Open coded function vdso_read_begin() to handle * VDSO_CLOCK_TIMENS. See comment in do_hres(). 
*/ - while ((seq = READ_ONCE(vd->seq)) & 1) { + while ((seq = READ_ONCE(vc->seq)) & 1) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - return do_coarse_timens(vd, clk, ts); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + return do_coarse_timens(vd, vc, clk, ts); cpu_relax(); } smp_rmb(); ts->tv_sec = vdso_ts->sec; ts->tv_nsec = vdso_ts->nsec; - } while (unlikely(vdso_read_retry(vd, seq))); + } while (unlikely(vdso_read_retry(vc, seq))); return 0; } static __always_inline int -__cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { + const struct vdso_clock *vc = vd->clock_data; u32 msk; /* Check for negative values or invalid clocks */ @@ -257,19 +278,19 @@ __cvdso_clock_gettime_common(const struct vdso_data *vd, clockid_t clock, */ msk = 1U << clock; if (likely(msk & VDSO_HRES)) - vd = &vd[CS_HRES_COARSE]; + vc = &vc[CS_HRES_COARSE]; else if (msk & VDSO_COARSE) - return do_coarse(&vd[CS_HRES_COARSE], clock, ts); + return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts); else if (msk & VDSO_RAW) - vd = &vd[CS_RAW]; + vc = &vc[CS_RAW]; else return -1; - return do_hres(vd, clock, ts); + return do_hres(vd, vc, clock, ts); } static __maybe_unused int -__cvdso_clock_gettime_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts) { int ret = __cvdso_clock_gettime_common(vd, clock, ts); @@ -282,12 +303,12 @@ __cvdso_clock_gettime_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { - return __cvdso_clock_gettime_data(__arch_get_vdso_data(), clock, ts); + return __cvdso_clock_gettime_data(__arch_get_vdso_u_time_data(), clock, ts); } #ifdef BUILD_VDSO32 static __maybe_unused int -__cvdso_clock_gettime32_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_gettime32_data(const struct vdso_time_data *vd, clockid_t clock, struct old_timespec32 *res) { struct __kernel_timespec ts; @@ -308,19 +329,20 @@ __cvdso_clock_gettime32_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_gettime32(clockid_t clock, struct old_timespec32 *res) { - return __cvdso_clock_gettime32_data(__arch_get_vdso_data(), clock, res); + return __cvdso_clock_gettime32_data(__arch_get_vdso_u_time_data(), clock, res); } #endif /* BUILD_VDSO32 */ static __maybe_unused int -__cvdso_gettimeofday_data(const struct vdso_data *vd, +__cvdso_gettimeofday_data(const struct vdso_time_data *vd, struct __kernel_old_timeval *tv, struct timezone *tz) { + const struct vdso_clock *vc = vd->clock_data; if (likely(tv != NULL)) { struct __kernel_timespec ts; - if (do_hres(&vd[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) + if (do_hres(vd, &vc[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) return gettimeofday_fallback(tv, tz); tv->tv_sec = ts.tv_sec; @@ -329,8 +351,8 @@ __cvdso_gettimeofday_data(const struct vdso_data *vd, if (unlikely(tz != NULL)) { if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; @@ -342,20 +364,23 @@ __cvdso_gettimeofday_data(const struct vdso_data *vd, static __maybe_unused int 
__cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { - return __cvdso_gettimeofday_data(__arch_get_vdso_data(), tv, tz); + return __cvdso_gettimeofday_data(__arch_get_vdso_u_time_data(), tv, tz); } #ifdef VDSO_HAS_TIME static __maybe_unused __kernel_old_time_t -__cvdso_time_data(const struct vdso_data *vd, __kernel_old_time_t *time) +__cvdso_time_data(const struct vdso_time_data *vd, __kernel_old_time_t *time) { + const struct vdso_clock *vc = vd->clock_data; __kernel_old_time_t t; if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) { + vd = __arch_get_vdso_u_timens_data(vd); + vc = vd->clock_data; + } - t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); + t = READ_ONCE(vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec); if (time) *time = t; @@ -365,15 +390,16 @@ __cvdso_time_data(const struct vdso_data *vd, __kernel_old_time_t *time) static __maybe_unused __kernel_old_time_t __cvdso_time(__kernel_old_time_t *time) { - return __cvdso_time_data(__arch_get_vdso_data(), time); + return __cvdso_time_data(__arch_get_vdso_u_time_data(), time); } #endif /* VDSO_HAS_TIME */ #ifdef VDSO_HAS_CLOCK_GETRES static __maybe_unused -int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, +int __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { + const struct vdso_clock *vc = vd->clock_data; u32 msk; u64 ns; @@ -382,8 +408,8 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, return -1; if (IS_ENABLED(CONFIG_TIME_NS) && - vd->clock_mode == VDSO_CLOCKMODE_TIMENS) - vd = __arch_get_timens_vdso_data(vd); + vc->clock_mode == VDSO_CLOCKMODE_TIMENS) + vd = __arch_get_vdso_u_timens_data(vd); /* * Convert the clockid to a bitmask and use it to check which @@ -394,7 +420,7 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, /* * Preserves the behaviour of posix_get_hrtimer_res(). */ - ns = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res); + ns = READ_ONCE(vd->hrtimer_res); } else if (msk & VDSO_COARSE) { /* * Preserves the behaviour of posix_get_coarse_res(). 
@@ -412,7 +438,7 @@ int __cvdso_clock_getres_common(const struct vdso_data *vd, clockid_t clock, } static __maybe_unused -int __cvdso_clock_getres_data(const struct vdso_data *vd, clockid_t clock, +int __cvdso_clock_getres_data(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *res) { int ret = __cvdso_clock_getres_common(vd, clock, res); @@ -425,12 +451,12 @@ int __cvdso_clock_getres_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_getres(clockid_t clock, struct __kernel_timespec *res) { - return __cvdso_clock_getres_data(__arch_get_vdso_data(), clock, res); + return __cvdso_clock_getres_data(__arch_get_vdso_u_time_data(), clock, res); } #ifdef BUILD_VDSO32 static __maybe_unused int -__cvdso_clock_getres_time32_data(const struct vdso_data *vd, clockid_t clock, +__cvdso_clock_getres_time32_data(const struct vdso_time_data *vd, clockid_t clock, struct old_timespec32 *res) { struct __kernel_timespec ts; @@ -451,7 +477,7 @@ __cvdso_clock_getres_time32_data(const struct vdso_data *vd, clockid_t clock, static __maybe_unused int __cvdso_clock_getres_time32(clockid_t clock, struct old_timespec32 *res) { - return __cvdso_clock_getres_time32_data(__arch_get_vdso_data(), + return __cvdso_clock_getres_time32_data(__arch_get_vdso_u_time_data(), clock, res); } #endif /* BUILD_VDSO32 */ diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 56fe96319292..734bd70c8b9b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -114,6 +114,13 @@ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) } EXPORT_SYMBOL(simple_strtoul); +unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base, + size_t max_chars) +{ + return simple_strntoull(cp, endp, base, max_chars); +} +EXPORT_SYMBOL(simple_strntoul); + /** * simple_strtol - convert a string to a signed long * @cp: The start of the string diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h index 0e3b2c0a527d..0dde8bf56595 100644 --- a/lib/zstd/common/portability_macros.h +++ b/lib/zstd/common/portability_macros.h @@ -55,7 +55,7 @@ #ifndef DYNAMIC_BMI2 #if ((defined(__clang__) && __has_attribute(__target__)) \ || (defined(__GNUC__) \ - && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ + && (__GNUC__ >= 11))) \ && (defined(__x86_64__) || defined(_M_X64)) \ && !defined(__BMI2__) # define DYNAMIC_BMI2 1 diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db06417..0b7f4bb5cb80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/damon/core.c b/mm/damon/core.c index c7b981308862..384935ef4e65 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -373,6 +373,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, * or damon_attrs are updated. 
*/ scheme->next_apply_sis = 0; + scheme->walk_completed = false; INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -1429,9 +1430,13 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, { struct damos_filter *filter; + s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) + if (damos_filter_match(ctx, t, r, filter)) { + if (filter->allow) + s->core_filters_allowed = true; return !filter->allow; + } } return false; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 0f9ae14f884d..c834aa217835 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -236,6 +236,9 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) { struct damos_filter *filter; + if (scheme->core_filters_allowed) + return false; + damos_for_each_filter(filter, scheme) { if (damos_pa_filter_match(filter, folio)) return !filter->allow; diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be..e6c4f5076ca8 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { diff --git a/mm/filemap.c b/mm/filemap.c index 6d616bb9001e..e9404290f2c6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1985,8 +1985,19 @@ no_page: if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. 
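The filemap.c hunk above maps a nonblocking folio-allocation failure to -EAGAIN when FGP_NOWAIT is set. A minimal sketch of how a no-wait I/O path might consume that result; the helper name and the simplified retry policy are hypothetical, only __filemap_get_folio() and the FGP flags come from the patch context:

static struct folio *example_grab_folio_nowait(struct address_space *mapping,
					       pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_CREAT | FGP_NOWAIT,
				    mapping_gfp_mask(mapping));
	/*
	 * With the change above, a failed nonblocking allocation surfaces as
	 * -EAGAIN rather than -ENOMEM, so the I/O can be requeued to a
	 * context that is allowed to block instead of failing hard.
	 */
	if (IS_ERR(folio) && PTR_ERR(folio) == -EAGAIN)
		return NULL;	/* caller retries without FGP_NOWAIT */

	return folio;
}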
@@ -4083,17 +4094,6 @@ retry: bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } - if (fatal_signal_pending(current)) { status = -EINTR; break; @@ -4111,6 +4111,12 @@ retry: if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. + */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); @@ -4136,6 +4142,16 @@ retry: bytes = copied; goto retry; } + + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } } else { pos += status; written += status; @@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? 
page : NULL; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3d3ebdc002d5..373781b21e5c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3304,7 +3304,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); - folio_put(tail); + folio_put_refs(tail, folio_nr_pages(tail)); } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 97930d44d460..318624c96584 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2135,6 +2135,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2157,7 +2159,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2177,7 +2181,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 2be6b9112808..2e9fa431bbf5 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. " + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4de6acb9b8ec..a037ec92881d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1921,9 +1921,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct obj_cgroup *old; + unsigned long flags; stock = &per_cpu(memcg_stock, cpu); + + /* drain_obj_stock requires stock_lock */ + local_lock_irqsave(&memcg_stock.stock_lock, flags); + old = drain_obj_stock(stock); + local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + drain_stock(stock); + obj_cgroup_put(old); return 0; } @@ -4993,7 +5002,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; @@ -5055,7 +5064,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); - swap_cgroup_record(folio, entry); + swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); return 0; } diff --git a/mm/memory.c b/mm/memory.c index fb7b8dc75167..4f6d9766a046 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6834,10 +6834,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif diff --git a/mm/mempolicy.c 
b/mm/mempolicy.c index bbaadbeeb291..a9eea051b0d6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; diff --git a/mm/migrate.c b/mm/migrate.c index fb19a18892c8..97f0edf0c032 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -518,15 +518,13 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ - if (folio_test_swapbacked(folio)) { + if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - if (folio_test_swapcache(folio)) { - folio_set_swapcache(newfolio); - newfolio->private = folio_get_private(folio); - } + if (folio_test_swapcache(folio)) { + folio_set_swapcache(newfolio); + newfolio->private = folio_get_private(folio); entries = nr; } else { - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); entries = 1; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94917c729120..542d25f77be8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7004,7 +7004,7 @@ static inline bool has_unaccepted_memory(void) static bool cond_accept_memory(struct zone *zone, unsigned int order) { - long to_accept; + long to_accept, wmark; bool ret = false; if (!has_unaccepted_memory()) @@ -7013,8 +7013,18 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; + wmark = promo_wmark_pages(zone); + + /* + * Watermarks have not been initialized yet. + * + * Accepting one MAX_ORDER page to ensure progress. + */ + if (!wmark) + return try_to_accept_memory_one(zone); + /* How much to accept to get to promo watermark? 
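The mempolicy.c hunk above introduces nearest_node_nodemask(). A short usage sketch under the documented contract (returns MAX_NUMNODES when the mask is empty); the caller and its policy are hypothetical:

static int example_pick_node(int preferred_nid, nodemask_t *allowed)
{
	int nid = preferred_nid;

	if (!node_isset(nid, *allowed)) {
		/* Closest allowed node by node_distance(). */
		nid = nearest_node_nodemask(preferred_nid, allowed);
		if (nid == MAX_NUMNODES)
			nid = NUMA_NO_NODE;	/* empty mask: let the allocator decide */
	}
	return nid;
}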
*/ - to_accept = promo_wmark_pages(zone) - + to_accept = wmark - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); diff --git a/mm/percpu.c b/mm/percpu.c index ac61e3fc5f15..7b5835356d1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3071,7 +3071,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3240,7 +3240,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/shmem.c b/mm/shmem.c index 1ede0800e846..ab61c8bb20e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3912,16 +3912,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..05a21dc796e0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); @@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git 
a/mm/slab_common.c b/mm/slab_common.c index 4c9f0a87f733..5be257e03c7c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || @@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifndef CONFIG_KVFREE_RCU_BATCHED + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, kvfree_rcu_cb); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -1534,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -1863,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -1889,8 +1902,8 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); } } @@ -2073,8 +2086,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2168,3 +2179,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..5eac408e818e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } @@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. 
*/ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -4241,6 +4264,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4716,6 +4740,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4725,9 +4754,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4878,6 +4953,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. 
+ */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. 
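The kvmalloc/kvfree/kvrealloc kerneldoc being moved into mm/slub.c above describes the contiguous-first, vmalloc-fallback behaviour. A minimal usage sketch under those documented rules; the structure and function names are illustrative, not part of the patch:

struct example_table {
	size_t nr;
	u64 entries[];
};

static struct example_table *example_table_alloc(size_t nr)
{
	struct example_table *t;

	/* GFP_KERNEL only: nonblocking gfp masks are not supported by kvmalloc(). */
	t = kvmalloc(struct_size(t, entries, nr), GFP_KERNEL);
	if (!t)
		return NULL;

	t->nr = nr;
	return t;
}

static void example_table_free(struct example_table *t)
{
	kvfree(t);	/* correct for both kmalloc- and vmalloc-backed memory */
}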
*/ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5570,14 +5807,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5592,6 +5829,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5612,8 +5851,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index be39078f255b..1007c30f12e2 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -58,9 +58,11 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map, * entries must not have been charged * * @folio: the folio that the swap entry belongs to + * @id: mem_cgroup ID to be recorded * @ent: the first swap entry to be recorded */ -void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +void swap_cgroup_record(struct folio *folio, unsigned short id, + swp_entry_t ent) { unsigned int nr_ents = folio_nr_pages(folio); struct swap_cgroup *map; @@ -72,8 +74,7 @@ void swap_cgroup_record(struct folio *folio, swp_entry_t ent) map = swap_cgroup_ctrl[swp_type(ent)].map; do { - old = __swap_cgroup_id_xchg(map, offset, - mem_cgroup_id(folio_memcg(folio))); + old = __swap_cgroup_id_xchg(map, offset, id); VM_BUG_ON(old); } while (++offset != end); } diff --git a/mm/usercopy.c b/mm/usercopy.c index 83c164aba6e0..dbdcc43964fb 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,7 +17,7 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> @@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } -static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); +EXPORT_SYMBOL(validate_usercopy_range); /* * Validates that the given object is: @@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (static_branch_unlikely(&bypass_usercopy_checks)) - return; - /* Skip all tests if size is zero. 
*/ if (!n) return; @@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) } EXPORT_SYMBOL(__check_object_size); -static bool enable_checks __initdata = true; +static bool enable_checks __initdata = + IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON); static int __init parse_hardened_usercopy(char *str) { @@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { - if (enable_checks == false) - static_branch_enable(&bypass_usercopy_checks); + if (enable_checks) + static_branch_enable(&validate_usercopy_range); + else + static_branch_disable(&validate_usercopy_range); return 1; } diff --git a/mm/util.c b/mm/util.c index 8c965474d329..e7d81371032b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -615,168 +615,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. - */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). 
- * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. - */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. */ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. @@ -2381,7 +2381,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. 
*/ - khugepaged_enter_vma(vma, map->flags); + if (!vma_is_anonymous(vma)) + khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 16bfe1c694dd..88998725f1c5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1435,6 +1435,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", diff --git a/net/atm/lec.c b/net/atm/lec.c index ffef658862db..a948dd47c3f3 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -181,6 +181,7 @@ static void lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = skb->dev; + unsigned int len = skb->len; ATM_SKB(skb)->vcc = vcc; atm_account_tx(vcc, skb); @@ -191,7 +192,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) } dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + dev->stats.tx_bytes += len; } static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150..b12645949ae5 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -325,8 +325,7 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /* send a batman ogm to a given interface */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e503ee0d896b..8f89ffe6020c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /** diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 50cfec8ccac4..3c29778171c5 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -825,11 +825,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { + struct sk_buff *skb; + /* Note that we must allocate using GFP_ATOMIC here as * this function is called originally from netdev hard xmit * function in atomic context. 
*/ - return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + return skb; } static void chan_suspend_cb(struct l2cap_chan *chan) diff --git a/net/can/af_can.c b/net/can/af_can.c index 01f3fbb3b67d..65230e81fa08 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -287,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx(newskb); /* update statistics */ - pkg_stats->tx_frames++; - pkg_stats->tx_frames_delta++; + atomic_long_inc(&pkg_stats->tx_frames); + atomic_long_inc(&pkg_stats->tx_frames_delta); return 0; @@ -647,8 +647,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) int matches; /* update statistics */ - pkg_stats->rx_frames++; - pkg_stats->rx_frames_delta++; + atomic_long_inc(&pkg_stats->rx_frames); + atomic_long_inc(&pkg_stats->rx_frames_delta); /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -669,8 +669,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); if (matches > 0) { - pkg_stats->matches++; - pkg_stats->matches_delta++; + atomic_long_inc(&pkg_stats->matches); + atomic_long_inc(&pkg_stats->matches_delta); } } diff --git a/net/can/af_can.h b/net/can/af_can.h index 7c2d9161e224..22f3352c77fe 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -66,9 +66,9 @@ struct receiver { struct can_pkg_stats { unsigned long jiffies_init; - unsigned long rx_frames; - unsigned long tx_frames; - unsigned long matches; + atomic_long_t rx_frames; + atomic_long_t tx_frames; + atomic_long_t matches; unsigned long total_rx_rate; unsigned long total_tx_rate; @@ -82,9 +82,9 @@ struct can_pkg_stats { unsigned long max_tx_rate; unsigned long max_rx_match_ratio; - unsigned long rx_frames_delta; - unsigned long tx_frames_delta; - unsigned long matches_delta; + atomic_long_t rx_frames_delta; + atomic_long_t tx_frames_delta; + atomic_long_t matches_delta; }; /* persistent statistics */ diff --git a/net/can/bcm.c b/net/can/bcm.c index 217049fa496e..526cb6cd901f 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1011,13 +1011,12 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_tx_timeout_handler; + hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); @@ -1192,13 +1191,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_rx_timeout_handler; - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->thrtimer.function = bcm_rx_thr_handler; + hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); diff --git a/net/can/isotp.c b/net/can/isotp.c index 
16046931542a..442c343afe1f 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1634,12 +1634,10 @@ static int isotp_init(struct sock *sk) so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); - hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->rxtimer.function = isotp_rx_timer_handler; - hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txtimer.function = isotp_tx_timer_handler; - hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txfrtimer.function = isotp_txfr_timer_handler; + hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 486687901602..39844f14eed8 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -158,8 +158,8 @@ struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) ecu->addr = J1939_IDLE_ADDR; ecu->name = name; - hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - ecu->ac_timer.function = j1939_ecu_timer_handler; + hrtimer_setup(&ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); INIT_LIST_HEAD(&ecu->list); j1939_priv_get(priv); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 9b72d118d756..fbf5c8001c9d 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1511,12 +1511,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv, skcb = j1939_skb_to_cb(skb); memcpy(&session->skcb, skcb, sizeof(session->skcb)); - hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->txtimer.function = j1939_tp_txtimer; - hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->rxtimer.function = j1939_tp_rxtimer; + hrtimer_setup(&session->txtimer, j1939_tp_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&session->rxtimer, j1939_tp_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", __func__, session, skcb->addr.sa, skcb->addr.da); diff --git a/net/can/proc.c b/net/can/proc.c index bbce97825f13..25fdf060e30d 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -118,6 +118,13 @@ void can_stat_update(struct timer_list *t) struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ + long rx_frames = atomic_long_read(&pkg_stats->rx_frames); + long tx_frames = atomic_long_read(&pkg_stats->tx_frames); + long matches = atomic_long_read(&pkg_stats->matches); + long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta); + long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta); + long matches_delta = atomic_long_read(&pkg_stats->matches_delta); + /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); @@ -127,35 +134,33 @@ void can_stat_update(struct timer_list *t) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) + if (rx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) + if (tx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* matches 
overflow - very improbable */ - if (pkg_stats->matches > (ULONG_MAX / 100)) + if (matches > (LONG_MAX / 100)) can_init_stats(net); /* calc total values */ - if (pkg_stats->rx_frames) - pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / - pkg_stats->rx_frames; + if (rx_frames) + pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->tx_frames); + tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->rx_frames); + rx_frames); /* calc current values */ - if (pkg_stats->rx_frames_delta) + if (rx_frames_delta) pkg_stats->current_rx_match_ratio = - (pkg_stats->matches_delta * 100) / - pkg_stats->rx_frames_delta; + (matches_delta * 100) / rx_frames_delta; - pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); - pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); + pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta); + pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) @@ -168,9 +173,9 @@ void can_stat_update(struct timer_list *t) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - pkg_stats->tx_frames_delta = 0; - pkg_stats->rx_frames_delta = 0; - pkg_stats->matches_delta = 0; + atomic_long_set(&pkg_stats->tx_frames_delta, 0); + atomic_long_set(&pkg_stats->rx_frames_delta, 0); + atomic_long_set(&pkg_stats->matches_delta, 0); /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); @@ -214,9 +219,12 @@ static int can_stats_proc_show(struct seq_file *m, void *v) struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", + atomic_long_read(&pkg_stats->tx_frames)); + seq_printf(m, " %8ld received frames (RXF)\n", + atomic_long_read(&pkg_stats->rx_frames)); + seq_printf(m, " %8ld matched frames (RXMF)\n", + atomic_long_read(&pkg_stats->matches)); seq_putc(m, '\n'); diff --git a/net/core/dev.c b/net/core/dev.c index 2f7f5fd9ffec..901514e42d15 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7016,8 +7016,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..4417a18b3e95 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry 
*dst; + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } - if (!dst) + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) @@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; + goto drop; + } - if (!dst) + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; goto drop; + } lwtstate = dst->lwtstate; @@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) @@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bd0251bd74a1..1a620f903c56 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2250,6 +2250,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, diff --git a/net/devlink/core.c b/net/devlink/core.c index f49cd83f1955..7203c39532fc 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -117,7 +117,7 @@ static struct devlink_rel *devlink_rel_alloc(void) err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); - if (err) { + if (err < 0) { kfree(rel); return ERR_PTR(err); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9..e7a75afa995d 100644 --- a/net/ipv4/tcp_timer.c +++ 
b/net/ipv4/tcp_timer.c @@ -884,11 +884,9 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); - hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); - tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); - hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; + hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8b6258819dad..ac8cc1076536 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3209,13 +3209,16 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen; + int scope, plen, offset = 0; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ + if (idev->dev->addr_len == sizeof(struct in6_addr)) + offset = sizeof(struct in6_addr) - 4; + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3526,13 +3529,7 @@ static void addrconf_gre_config(struct net_device *dev) return; } - /* Generate the IPv6 link-local address using addrconf_addr_gen(), - * unless we have an IPv4 GRE device not bound to an IP address and - * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this - * case). Such devices fall back to add_v4_addrs() instead. 
- */ - if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && - idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { + if (dev->type == ARPHRD_ETHER) { addrconf_addr_gen(idev, true); return; } diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 2c383c12a431..09065187378e 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; - struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -352,8 +351,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) goto out; - orig_daddr = ipv6_hdr(skb)->daddr; - local_bh_disable(); cache_dst = dst_cache_get(&ilwt->cache); local_bh_enable(); @@ -422,7 +419,10 @@ do_encap: goto drop; } - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + /* avoid lwtunnel_output() reentry loop when destination is the same + * after transformation (e.g., with the inline mode) + */ + if (dst->lwtstate != cache_dst->lwtstate) { skb_dst_drop(skb); skb_dst_set(skb, cache_dst); return dst_output(net, sk, skb); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ef2d23a1e3d5..15ce21afc8c6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3644,7 +3644,8 @@ out: in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } @@ -3802,10 +3803,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (nh) { if (rt->fib6_src.plen) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + err = -EINVAL; goto out_free; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } rt->nh = nh; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a45bf17cb2a1..ae2da28f9dfb 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -94,14 +94,23 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) } static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, __be16 *oldport, __be16 newport) { - struct tcphdr *th; + struct tcphdr *th = tcp_hdr(seg); + + if (!ipv6_addr_equal(oldip, newip)) { + inet_proto_csum_replace16(&th->check, seg, + oldip->s6_addr32, + newip->s6_addr32, + true); + *oldip = *newip; + } if (*oldport == newport) return; - th = tcp_hdr(seg); inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); *oldport = newport; } @@ -129,10 +138,10 @@ static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) th2 = tcp_hdr(seg); iph2 = ipv6_hdr(seg); - iph2->saddr = iph->saddr; - iph2->daddr = iph->daddr; - __tcpv6_gso_segment_csum(seg, &th2->source, th->source); - __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); + __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &th2->dest, th->dest); } return segs; diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 21b7c3b280b4..ea1efef3572a 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -213,8 +213,8 @@ int ieee802154_register_hw(struct 
ieee802154_hw *hw) goto out_wq; } - hrtimer_init(&local->ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - local->ifs_timer.function = ieee802154_xmit_ifs_timer; + hrtimer_setup(&local->ifs_timer, ieee802154_xmit_ifs_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); wpan_phy_set_dev(local->phy, local->hw.parent); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index fd2de185bc93..23949ae2a3a8 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -651,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -659,7 +660,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -672,7 +673,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -689,6 +690,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a64a0cab1bf7..3cc3af15086f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -344,6 +344,7 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ + unsigned long app_data; /* Application data (e.g. afs_server) */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 56e09d161a97..71b6e07bf161 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -461,7 +461,7 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { - pr_err("Leaked peer %u {%u} %pISp\n", + pr_err("Leaked peer %x {%u} %pISp\n", peer->debug_id, refcount_read(&peer->ref), &peer->srx.transport); @@ -478,7 +478,7 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { - return call->peer; + return rxrpc_get_peer(call->peer, rxrpc_peer_get_application); } EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); @@ -520,3 +520,29 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); } EXPORT_SYMBOL(rxrpc_kernel_remote_addr); + +/** + * rxrpc_kernel_set_peer_data - Set app-specific data on a peer. + * @peer: The peer to alter + * @app_data: The data to set + * + * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain + * anything the data might refer to. The previous app_data is returned. 
+ */ +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) +{ + return xchg(&peer->app_data, app_data); +} +EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); + +/** + * rxrpc_kernel_get_peer_data - Get app-specific data from a peer. + * @peer: The peer to query + * + * Retrieve the app-specific data from a peer. + */ +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) +{ + return peer->app_data; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer_data); diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 91c0ec729823..c1f75f272757 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -287,8 +287,7 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6c625dcd0651..0b102a0ca83d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -619,8 +619,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { - hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); - wd->timer.function = qdisc_watchdog; + hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a68e17891b0b..14021b812329 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1932,8 +1932,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); @@ -2056,8 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, spin_lock_init(&q->current_entry_lock); - hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..d158cb6dd391 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -105,7 +105,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (pool->unaligned) pool->free_heads[i] = xskb; else - xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); + xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size); } return pool; diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 755f1eea8bfa..3b6d7284fc70 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2625,12 +2625,10 @@ static void __iptfs_init_state(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs) { __skb_queue_head_init(&xtfs->queue); - hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); - xtfs->iptfs_timer.function = iptfs_delay_timer; + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, 
IPTFS_HRTIMER_MODE); spin_lock_init(&xtfs->drop_lock); - hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); - xtfs->drop_timer.function = iptfs_drop_timer; + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); /* Modify type (esp) adjustment values */ diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index f7abd42c077d..3cabc87978dd 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -612,6 +612,40 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output_resume); +static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net *net = xs_net(x); + int err; + + dst = skb_dst_pop(skb); + if (!dst) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EHOSTUNREACH; + } + skb_dst_set(skb, dst); + nf_reset_ct(skb); + + err = skb_dst(skb)->ops->local_out(net, sk, skb); + if (unlikely(err != 1)) { + kfree_skb(skb); + return err; + } + + /* In transport mode, network destination is + * directly reachable, while in tunnel mode, + * inner packet network may not be. In packet + * offload type, HW is responsible for hard + * header packet mangling so directly xmit skb + * to netdevice. + */ + skb->dev = x->xso.dev; + __skb_push(skb, skb->dev->hard_header_len); + return dev_queue_xmit(skb); +} + static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(sk, skb, 1); @@ -735,6 +769,13 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) return -EHOSTUNREACH; } + /* Exclusive direct xmit for tunnel mode, as + * some filtering or matching rules may apply + * in transport mode. + */ + if (x->props.mode == XFRM_MODE_TUNNEL) + return xfrm_dev_direct_output(sk, x, skb); + return xfrm_output_resume(sk, skb, 0); } @@ -758,7 +799,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) skb->encapsulation = 1; if (skb_is_gso(skb)) { - if (skb->inner_protocol) + if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL) return xfrm_output_gso(net, sk, skb); skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ad2202fa82f3..9bd14fdb67a5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -746,8 +746,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); - hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); - x->mtimer.function = xfrm_timer_handler; + hrtimer_setup(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, + HRTIMER_MODE_ABS_SOFT); timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index f46cf3bb7069..2396ca1cf8fb 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -10,6 +10,7 @@ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/blkdev.h> +#include <linux/cpumask.h> #include <linux/cred.h> #include <linux/device/faux.h> #include <linux/errname.h> diff --git a/rust/helpers/cpumask.c b/rust/helpers/cpumask.c new file mode 100644 index 000000000000..2d380a86c34a --- /dev/null +++ b/rust/helpers/cpumask.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/cpumask.h> + +void rust_helper_cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + cpumask_set_cpu(cpu, dstp); +} + +void 
rust_helper_cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + cpumask_clear_cpu(cpu, dstp); +} + +void rust_helper_cpumask_setall(struct cpumask *dstp) +{ + cpumask_setall(dstp); +} + +unsigned int rust_helper_cpumask_weight(struct cpumask *srcp) +{ + return cpumask_weight(srcp); +} + +void rust_helper_cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) +{ + cpumask_copy(dstp, srcp); +} + +bool rust_helper_alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return alloc_cpumask_var(mask, flags); +} + +bool rust_helper_zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return zalloc_cpumask_var(mask, flags); +} + +#ifndef CONFIG_CPUMASK_OFFSTACK +void rust_helper_free_cpumask_var(cpumask_var_t mask) +{ + free_cpumask_var(mask); +} +#endif diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 0640b7e115be..e1c21eba9b15 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -11,6 +11,7 @@ #include "bug.c" #include "build_assert.c" #include "build_bug.c" +#include "cpumask.c" #include "cred.c" #include "device.c" #include "err.c" @@ -30,6 +31,7 @@ #include "signal.c" #include "slab.c" #include "spinlock.c" +#include "sync.c" #include "task.c" #include "uaccess.c" #include "vmalloc.c" diff --git a/rust/helpers/sync.c b/rust/helpers/sync.c new file mode 100644 index 000000000000..ff7e68b48810 --- /dev/null +++ b/rust/helpers/sync.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/lockdep.h> + +void rust_helper_lockdep_register_key(struct lock_class_key *k) +{ + lockdep_register_key(k); +} + +void rust_helper_lockdep_unregister_key(struct lock_class_key *k) +{ + lockdep_unregister_key(k); +} diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs index e03dbe14d62a..736209a1b983 100644 --- a/rust/kernel/fs/file.rs +++ b/rust/kernel/fs/file.rs @@ -392,6 +392,7 @@ pub struct FileDescriptorReservation { impl FileDescriptorReservation { /// Creates a new file descriptor reservation. + #[inline] pub fn get_unused_fd_flags(flags: u32) -> Result<Self> { // SAFETY: FFI call, there are no safety requirements on `flags`. let fd: i32 = unsafe { bindings::get_unused_fd_flags(flags) }; @@ -405,6 +406,7 @@ impl FileDescriptorReservation { } /// Returns the file descriptor number that was reserved. + #[inline] pub fn reserved_fd(&self) -> u32 { self.fd } @@ -413,6 +415,7 @@ impl FileDescriptorReservation { /// /// The previously reserved file descriptor is bound to `file`. This method consumes the /// [`FileDescriptorReservation`], so it will not be usable after this call. + #[inline] pub fn fd_install(self, file: ARef<File>) { // SAFETY: `self.fd` was previously returned by `get_unused_fd_flags`. We have not yet used // the fd, so it is still valid, and `current` still refers to the same task, as this type @@ -433,6 +436,7 @@ impl FileDescriptorReservation { } impl Drop for FileDescriptorReservation { + #[inline] fn drop(&mut self) { // SAFETY: By the type invariants of this type, `self.fd` was previously returned by // `get_unused_fd_flags`. We have not yet used the fd, so it is still valid, and `current` diff --git a/rust/kernel/seq_file.rs b/rust/kernel/seq_file.rs index 04947c672979..efc4dd09850a 100644 --- a/rust/kernel/seq_file.rs +++ b/rust/kernel/seq_file.rs @@ -30,6 +30,7 @@ impl SeqFile { } /// Used by the [`seq_print`] macro. + #[inline] pub fn call_printf(&self, args: core::fmt::Arguments<'_>) { // SAFETY: Passing a void pointer to `Arguments` is valid for `%pA`. 
unsafe { diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs index 16eab9138b2b..4104bc26471a 100644 --- a/rust/kernel/sync.rs +++ b/rust/kernel/sync.rs @@ -5,6 +5,8 @@ //! This module contains the kernel APIs related to synchronisation that have been ported or //! wrapped for usage by Rust code in the kernel. +use crate::pin_init; +use crate::prelude::*; use crate::types::Opaque; mod arc; @@ -23,15 +25,64 @@ pub use locked_by::LockedBy; /// Represents a lockdep class. It's a wrapper around C's `lock_class_key`. #[repr(transparent)] -pub struct LockClassKey(Opaque<bindings::lock_class_key>); +#[pin_data(PinnedDrop)] +pub struct LockClassKey { + #[pin] + inner: Opaque<bindings::lock_class_key>, +} // SAFETY: `bindings::lock_class_key` is designed to be used concurrently from multiple threads and // provides its own synchronization. unsafe impl Sync for LockClassKey {} impl LockClassKey { + /// Initializes a dynamically allocated lock class key. In the common case of using a + /// statically allocated lock class key, the static_lock_class! macro should be used instead. + /// + /// # Example + /// ``` + /// # use kernel::{c_str, stack_pin_init}; + /// # use kernel::alloc::KBox; + /// # use kernel::types::ForeignOwnable; + /// # use kernel::sync::{LockClassKey, SpinLock}; + /// + /// let key = KBox::pin_init(LockClassKey::new_dynamic(), GFP_KERNEL)?; + /// let key_ptr = key.into_foreign(); + /// + /// { + /// stack_pin_init!(let num: SpinLock<u32> = SpinLock::new( + /// 0, + /// c_str!("my_spinlock"), + /// // SAFETY: `key_ptr` is returned by the above `into_foreign()`, whose + /// // `from_foreign()` has not yet been called. + /// unsafe { <Pin<KBox<LockClassKey>> as ForeignOwnable>::borrow(key_ptr) } + /// )); + /// } + /// + /// // SAFETY: We dropped `num`, the only use of the key, so the result of the previous + /// // `borrow` has also been dropped. Thus, it's safe to use from_foreign. + /// unsafe { drop(<Pin<KBox<LockClassKey>> as ForeignOwnable>::from_foreign(key_ptr)) }; + /// + /// # Ok::<(), Error>(()) + /// ``` + pub fn new_dynamic() -> impl PinInit<Self> { + pin_init!(Self { + // SAFETY: lockdep_register_key expects an uninitialized block of memory + inner <- Opaque::ffi_init(|slot| unsafe { bindings::lockdep_register_key(slot) }) + }) + } + pub(crate) fn as_ptr(&self) -> *mut bindings::lock_class_key { - self.0.get() + self.inner.get() + } +} + +#[pinned_drop] +impl PinnedDrop for LockClassKey { + fn drop(self: Pin<&mut Self>) { + // SAFETY: self.as_ptr was registered with lockdep and self is pinned, so the address + // hasn't changed. Thus, it's safe to pass to unregister. + unsafe { bindings::lockdep_unregister_key(self.as_ptr()) } } } @@ -44,7 +95,7 @@ macro_rules! 
static_lock_class { // SAFETY: lockdep expects uninitialized memory when it's handed a statically allocated // lock_class_key unsafe { ::core::mem::MaybeUninit::uninit().assume_init() }; - &CLASS + $crate::prelude::Pin::static_ref(&CLASS) }}; } diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index 7df565038d7d..fbf68ada582f 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -11,12 +11,13 @@ use crate::{ init::PinInit, pin_init, str::CStr, - task::{MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE}, + task::{ + MAX_SCHEDULE_TIMEOUT, TASK_FREEZABLE, TASK_INTERRUPTIBLE, TASK_NORMAL, TASK_UNINTERRUPTIBLE, + }, time::Jiffies, types::Opaque, }; -use core::marker::PhantomPinned; -use core::ptr; +use core::{marker::PhantomPinned, pin::Pin, ptr}; use macros::pin_data; /// Creates a [`CondVar`] initialiser with the given name and a newly-created lock class. @@ -101,7 +102,7 @@ unsafe impl Sync for CondVar {} impl CondVar { /// Constructs a new condvar initialiser. - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> { pin_init!(Self { _pin: PhantomPinned, // SAFETY: `slot` is valid while the closure is called and both `name` and `key` have @@ -159,6 +160,25 @@ impl CondVar { crate::current!().signal_pending() } + /// Releases the lock and waits for a notification in interruptible and freezable mode. + /// + /// The process is allowed to be frozen during this sleep. No lock should be held when calling + /// this function, and there is a lockdep assertion for this. Freezing a task that holds a lock + /// can trivially deadlock vs another task that needs that lock to complete before it too can + /// hit freezable. + #[must_use = "wait_interruptible_freezable returns if a signal is pending, so the caller must check the return value"] + pub fn wait_interruptible_freezable<T: ?Sized, B: Backend>( + &self, + guard: &mut Guard<'_, T, B>, + ) -> bool { + self.wait_internal( + TASK_INTERRUPTIBLE | TASK_FREEZABLE, + guard, + MAX_SCHEDULE_TIMEOUT, + ); + crate::current!().signal_pending() + } + /// Releases the lock and waits for a notification in interruptible mode. /// /// Atomically releases the given lock (whose ownership is proven by the guard) and puts the diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index eb80048e0110..360a10a9216d 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -12,7 +12,7 @@ use crate::{ str::CStr, types::{NotThreadSafe, Opaque, ScopeGuard}, }; -use core::{cell::UnsafeCell, marker::PhantomPinned}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; use macros::pin_data; pub mod mutex; @@ -129,7 +129,7 @@ unsafe impl<T: ?Sized + Send, B: Backend> Sync for Lock<T, B> {} impl<T, B: Backend> Lock<T, B> { /// Constructs a new lock initialiser. - pub fn new(t: T, name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> { + pub fn new(t: T, name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> { pin_init!(Self { data: UnsafeCell::new(t), _pin: PhantomPinned, @@ -199,7 +199,36 @@ pub struct Guard<'a, T: ?Sized, B: Backend> { // SAFETY: `Guard` is sync when the data protected by the lock is also sync. 
unsafe impl<T: Sync + ?Sized, B: Backend> Sync for Guard<'_, T, B> {} -impl<T: ?Sized, B: Backend> Guard<'_, T, B> { +impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { + /// Returns the lock that this guard originates from. + /// + /// # Examples + /// + /// The following example shows how to use [`Guard::lock_ref()`] to assert the corresponding + /// lock is held. + /// + /// ``` + /// # use kernel::{new_spinlock, stack_pin_init, sync::lock::{Backend, Guard, Lock}}; + /// + /// fn assert_held<T, B: Backend>(guard: &Guard<'_, T, B>, lock: &Lock<T, B>) { + /// // Address-equal means the same lock. + /// assert!(core::ptr::eq(guard.lock_ref(), lock)); + /// } + /// + /// // Creates a new lock on the stack. + /// stack_pin_init!{ + /// let l = new_spinlock!(42) + /// } + /// + /// let g = l.lock(); + /// + /// // `g` originates from `l`. + /// assert_held(&g, &l); + /// ``` + pub fn lock_ref(&self) -> &'a Lock<T, B> { + self.lock + } + pub(crate) fn do_unlocked<U>(&mut self, cb: impl FnOnce() -> U) -> U { // SAFETY: The caller owns the lock, so it is safe to unlock it. unsafe { B::unlock(self.lock.state.get(), &self.state) }; diff --git a/rust/kernel/sync/lock/global.rs b/rust/kernel/sync/lock/global.rs index 480ee724e3cc..d65f94b5caf2 100644 --- a/rust/kernel/sync/lock/global.rs +++ b/rust/kernel/sync/lock/global.rs @@ -13,6 +13,7 @@ use crate::{ use core::{ cell::UnsafeCell, marker::{PhantomData, PhantomPinned}, + pin::Pin, }; /// Trait implemented for marker types for global locks. @@ -26,7 +27,7 @@ pub trait GlobalLockBackend { /// The backend used for this global lock. type Backend: Backend + 'static; /// The class for this global lock. - fn get_lock_class() -> &'static LockClassKey; + fn get_lock_class() -> Pin<&'static LockClassKey>; } /// Type used for global locks. @@ -270,7 +271,7 @@ macro_rules! global_lock { type Item = $valuety; type Backend = $crate::global_lock_inner!(backend $kind); - fn get_lock_class() -> &'static $crate::sync::LockClassKey { + fn get_lock_class() -> Pin<&'static $crate::sync::LockClassKey> { $crate::static_lock_class!() } } diff --git a/rust/kernel/sync/poll.rs b/rust/kernel/sync/poll.rs index d5f17153b424..c4934f82d68b 100644 --- a/rust/kernel/sync/poll.rs +++ b/rust/kernel/sync/poll.rs @@ -89,7 +89,7 @@ pub struct PollCondVar { impl PollCondVar { /// Constructs a new condvar initialiser. - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> { + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> { pin_init!(Self { inner <- CondVar::new(name, key), }) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 38da555a2bdb..0e40a3dc0410 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -23,6 +23,8 @@ pub const MAX_SCHEDULE_TIMEOUT: c_long = c_long::MAX; pub const TASK_INTERRUPTIBLE: c_int = bindings::TASK_INTERRUPTIBLE as c_int; /// Bitmask for tasks that are sleeping in an uninterruptible state. pub const TASK_UNINTERRUPTIBLE: c_int = bindings::TASK_UNINTERRUPTIBLE as c_int; +/// Bitmask for tasks that are sleeping in a freezable state. +pub const TASK_FREEZABLE: c_int = bindings::TASK_FREEZABLE as c_int; /// Convenience constant for waking up tasks regardless of whether they are in interruptible or /// uninterruptible sleep. 
pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index b7be224cdf4b..f98bd02b838f 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -369,7 +369,7 @@ unsafe impl<T: ?Sized, const ID: u64> Sync for Work<T, ID> {} impl<T: ?Sized, const ID: u64> Work<T, ID> { /// Creates a new instance of [`Work`]. #[inline] - pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> + pub fn new(name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> where T: WorkItem<ID>, { @@ -703,3 +703,21 @@ pub fn system_freezable_power_efficient() -> &'static Queue { // SAFETY: `system_freezable_power_efficient_wq` is a C global, always available. unsafe { Queue::from_raw(bindings::system_freezable_power_efficient_wq) } } + +/// Returns the system bottom halves work queue (`system_bh_wq`). +/// +/// It is similar to the one returned by [`system`] but for work items which +/// need to run from a softirq context. +pub fn system_bh() -> &'static Queue { + // SAFETY: `system_bh_wq` is a C global, always available. + unsafe { Queue::from_raw(bindings::system_bh_wq) } +} + +/// Returns the system bottom halves high-priority work queue (`system_bh_highpri_wq`). +/// +/// It is similar to the one returned by [`system_bh`] but for work items which +/// require higher scheduling priority. +pub fn system_bh_highpri() -> &'static Queue { + // SAFETY: `system_bh_highpri_wq` is a C global, always available. + unsafe { Queue::from_raw(bindings::system_bh_highpri_wq) } +} diff --git a/samples/check-exec/run-script-ask.inc b/samples/check-exec/run-script-ask.sh index 8ef0fdc37266..8ef0fdc37266 100755 --- a/samples/check-exec/run-script-ask.inc +++ b/samples/check-exec/run-script-ask.sh diff --git a/samples/vfs/samples-vfs.h b/samples/vfs/samples-vfs.h index 103e1e7c4cec..498baf581b56 100644 --- a/samples/vfs/samples-vfs.h +++ b/samples/vfs/samples-vfs.h @@ -42,7 +42,11 @@ struct statmount { __u32 opt_array; /* [str] Array of nul terminated fs options */ __u32 opt_sec_num; /* Number of security options */ __u32 opt_sec_array; /* [str] Array of nul terminated security options */ - __u64 __spare2[46]; + __u32 mnt_uidmap_num; /* Number of uid mappings */ + __u32 mnt_uidmap; /* [str] Array of uid mappings */ + __u32 mnt_gidmap_num; /* Number of gid mappings */ + __u32 mnt_gidmap; /* [str] Array of gid mappings */ + __u64 __spare2[44]; char str[]; /* Variable size part containing strings */ }; @@ -158,6 +162,14 @@ struct mnt_ns_info { #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #endif +#ifndef STATMOUNT_MNT_UIDMAP +#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */ +#endif + +#ifndef STATMOUNT_MNT_GIDMAP +#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... 
*/ +#endif + #ifndef MOUNT_ATTR_RDONLY #define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */ #endif diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c index 1a02ea4593e3..713c174626aa 100644 --- a/samples/vfs/test-list-all-mounts.c +++ b/samples/vfs/test-list-all-mounts.c @@ -128,20 +128,43 @@ next: STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID | STATMOUNT_MNT_OPTS | - STATMOUNT_FS_TYPE, 0); + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_UIDMAP | + STATMOUNT_MNT_GIDMAP, 0); if (!stmnt) { printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n", (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id); continue; } - printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", + printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n", (uint64_t)stmnt->mnt_id, (uint64_t)stmnt->mnt_parent_id, - stmnt->str + stmnt->fs_type, - stmnt->str + stmnt->mnt_root, - stmnt->str + stmnt->mnt_point, - stmnt->str + stmnt->mnt_opts); + (stmnt->mask & STATMOUNT_FS_TYPE) ? stmnt->str + stmnt->fs_type : "", + (stmnt->mask & STATMOUNT_MNT_ROOT) ? stmnt->str + stmnt->mnt_root : "", + (stmnt->mask & STATMOUNT_MNT_POINT) ? stmnt->str + stmnt->mnt_point : "", + (stmnt->mask & STATMOUNT_MNT_OPTS) ? stmnt->str + stmnt->mnt_opts : ""); + + if (stmnt->mask & STATMOUNT_MNT_UIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_uidmap; + + for (size_t idx = 0; idx < stmnt->mnt_uidmap_num; idx++) { + printf("mnt_uidmap[%zu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + if (stmnt->mask & STATMOUNT_MNT_GIDMAP) { + const char *idmap = stmnt->str + stmnt->mnt_gidmap; + + for (size_t idx = 0; idx < stmnt->mnt_gidmap_num; idx++) { + printf("mnt_gidmap[%zu]:\t%s\n", idx, idmap); + idmap += strlen(idmap) + 1; + } + } + + printf("\n"); + free(stmnt); } } diff --git a/scripts/Makefile.clang b/scripts/Makefile.clang index 2435efae67f5..b67636b28c35 100644 --- a/scripts/Makefile.clang +++ b/scripts/Makefile.clang @@ -12,6 +12,8 @@ CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu CLANG_TARGET_FLAGS_sparc := sparc64-linux-gnu CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu +# This is only for i386 UM builds, which need the 32-bit target not -m32 +CLANG_TARGET_FLAGS_i386 := i386-linux-gnu CLANG_TARGET_FLAGS_um := $(CLANG_TARGET_FLAGS_$(SUBARCH)) CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(SRCARCH)) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cad20f0e66ee..57620b439a1f 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -166,8 +166,8 @@ _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(target-stem).o)$(UBSAN_SANITIZE)$(is-kernel-object)), \ $(CFLAGS_UBSAN)) _c_flags += $(if $(patsubst n%,, \ - $(UBSAN_SIGNED_WRAP_$(target-stem).o)$(UBSAN_SANITIZE_$(target-stem).o)$(UBSAN_SIGNED_WRAP)$(UBSAN_SANITIZE)$(is-kernel-object)), \ - $(CFLAGS_UBSAN_SIGNED_WRAP)) + $(UBSAN_INTEGER_WRAP_$(target-stem).o)$(UBSAN_SANITIZE_$(target-stem).o)$(UBSAN_INTEGER_WRAP)$(UBSAN_SANITIZE)$(is-kernel-object)), \ + $(CFLAGS_UBSAN_INTEGER_WRAP)) endif ifeq ($(CONFIG_KCOV),y) @@ -277,6 +277,7 @@ objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable objtool-args-$(CONFIG_PREFIX_SYMBOLS) += --prefix=$(CONFIG_FUNCTION_PADDING_BYTES) +objtool-args-$(CONFIG_OBJTOOL_WERROR) += --Werror 
--backtrace objtool-args = $(objtool-args-y) \ $(if $(delay-objtool), --link) \ diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index b2d3b273b802..9e35198edbf0 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -14,5 +14,11 @@ ubsan-cflags-$(CONFIG_UBSAN_TRAP) += $(call cc-option,-fsanitize-trap=undefined export CFLAGS_UBSAN := $(ubsan-cflags-y) -ubsan-signed-wrap-cflags-$(CONFIG_UBSAN_SIGNED_WRAP) += -fsanitize=signed-integer-overflow -export CFLAGS_UBSAN_SIGNED_WRAP := $(ubsan-signed-wrap-cflags-y) +ubsan-integer-wrap-cflags-$(CONFIG_UBSAN_INTEGER_WRAP) += \ + -fsanitize-undefined-ignore-overflow-pattern=all \ + -fsanitize=signed-integer-overflow \ + -fsanitize=unsigned-integer-overflow \ + -fsanitize=implicit-signed-integer-truncation \ + -fsanitize=implicit-unsigned-integer-truncation \ + -fsanitize-ignorelist=$(srctree)/scripts/integer-wrap-ignore.scl +export CFLAGS_UBSAN_INTEGER_WRAP := $(ubsan-integer-wrap-cflags-y) diff --git a/scripts/documentation-file-ref-check b/scripts/documentation-file-ref-check index 68083f2f1122..408b1dbe7884 100755 --- a/scripts/documentation-file-ref-check +++ b/scripts/documentation-file-ref-check @@ -92,7 +92,7 @@ while (<IN>) { next if ($f =~ m,^Next/,); # Makefiles and scripts contain nasty expressions to parse docs - next if ($f =~ m/Makefile/ || $f =~ m/\.sh$/); + next if ($f =~ m/Makefile/ || $f =~ m/\.(sh|py|pl|~|rej|org|orig)$/); # It doesn't make sense to parse hidden files next if ($f =~ m#/\.#); diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh deleted file mode 100755 index 9459ca4f0f11..000000000000 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# This requires GCC 8.1 or better. 
Specifically, we require -# -mstack-protector-guard-reg, added by -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh deleted file mode 100755 index f680bb01aeeb..000000000000 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 13eb8b3901b8..8f7c4fb78c2c 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -164,7 +164,7 @@ def get_current_task(cpu): var_ptr = gdb.parse_and_eval("(struct task_struct *)cpu_tasks[0].task") return var_ptr.dereference() else: - var_ptr = gdb.parse_and_eval("&pcpu_hot.current_task") + var_ptr = gdb.parse_and_eval("&current_task") return per_cpu(var_ptr, cpu).dereference() elif utils.is_target_arch("aarch64"): current_task_addr = gdb.parse_and_eval("(unsigned long)$SP_EL0") diff --git a/scripts/get_abi.pl b/scripts/get_abi.pl deleted file mode 100755 index de1c0354b50c..000000000000 --- a/scripts/get_abi.pl +++ /dev/null @@ -1,1103 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -BEGIN { $Pod::Usage::Formatter = 'Pod::Text::Termcap'; } - -use strict; -use warnings; -use utf8; -use Pod::Usage qw(pod2usage); -use Getopt::Long; -use File::Find; -use IO::Handle; -use Fcntl ':mode'; -use Cwd 'abs_path'; -use Data::Dumper; - -my $help = 0; -my $hint = 0; -my $man = 0; -my $debug = 0; -my $enable_lineno = 0; -my $show_warnings = 1; -my $prefix="Documentation/ABI"; -my $sysfs_prefix="/sys"; -my $search_string; - -# Debug options -my $dbg_what_parsing = 1; -my $dbg_what_open = 2; -my $dbg_dump_abi_structs = 4; -my $dbg_undefined = 8; - -$Data::Dumper::Indent = 1; -$Data::Dumper::Terse = 1; - -# -# If true, assumes that the description is formatted with ReST -# -my $description_is_rst = 1; - -GetOptions( - "debug=i" => \$debug, - "enable-lineno" => \$enable_lineno, - "rst-source!" => \$description_is_rst, - "dir=s" => \$prefix, - 'help|?' => \$help, - "show-hints" => \$hint, - "search-string=s" => \$search_string, - man => \$man -) or pod2usage(2); - -pod2usage(1) if $help; -pod2usage(-exitstatus => 0, -noperldoc, -verbose => 2) if $man; - -pod2usage(2) if (scalar @ARGV < 1 || @ARGV > 2); - -my ($cmd, $arg) = @ARGV; - -pod2usage(2) if ($cmd ne "search" && $cmd ne "rest" && $cmd ne "validate" && $cmd ne "undefined"); -pod2usage(2) if ($cmd eq "search" && !$arg); - -require Data::Dumper if ($debug & $dbg_dump_abi_structs); - -my %data; -my %symbols; - -# -# Displays an error message, printing file name and line -# -sub parse_error($$$$) { - my ($file, $ln, $msg, $data) = @_; - - return if (!$show_warnings); - - $data =~ s/\s+$/\n/; - - print STDERR "Warning: file $file#$ln:\n\t$msg"; - - if ($data ne "") { - print STDERR ". 
Line\n\t\t$data"; - } else { - print STDERR "\n"; - } -} - -# -# Parse an ABI file, storing its contents at %data -# -sub parse_abi { - my $file = $File::Find::name; - - my $mode = (stat($file))[2]; - return if ($mode & S_IFDIR); - return if ($file =~ m,/README,); - return if ($file =~ m,/\.,); - return if ($file =~ m,\.(rej|org|orig|bak)$,); - - my $name = $file; - $name =~ s,.*/,,; - - my $fn = $file; - $fn =~ s,.*Documentation/ABI/,,; - - my $nametag = "File $fn"; - $data{$nametag}->{what} = "File $name"; - $data{$nametag}->{type} = "File"; - $data{$nametag}->{file} = $name; - $data{$nametag}->{filepath} = $file; - $data{$nametag}->{is_file} = 1; - $data{$nametag}->{line_no} = 1; - - my $type = $file; - $type =~ s,.*/(.*)/.*,$1,; - - my $what; - my $new_what; - my $tag = ""; - my $ln; - my $xrefs; - my $space; - my @labels; - my $label = ""; - - print STDERR "Opening $file\n" if ($debug & $dbg_what_open); - open IN, $file; - while(<IN>) { - $ln++; - if (m/^(\S+)(:\s*)(.*)/i) { - my $new_tag = lc($1); - my $sep = $2; - my $content = $3; - - if (!($new_tag =~ m/(what|where|date|kernelversion|contact|description|users)/)) { - if ($tag eq "description") { - # New "tag" is actually part of - # description. Don't consider it a tag - $new_tag = ""; - } elsif ($tag ne "") { - parse_error($file, $ln, "tag '$tag' is invalid", $_); - } - } - - # Invalid, but it is a common mistake - if ($new_tag eq "where") { - parse_error($file, $ln, "tag 'Where' is invalid. Should be 'What:' instead", ""); - $new_tag = "what"; - } - - if ($new_tag =~ m/what/) { - $space = ""; - $content =~ s/[,.;]$//; - - push @{$symbols{$content}->{file}}, " $file:" . ($ln - 1); - - if ($tag =~ m/what/) { - $what .= "\xac" . $content; - } else { - if ($what) { - parse_error($file, $ln, "What '$what' doesn't have a description", "") if (!$data{$what}->{description}); - - foreach my $w(split /\xac/, $what) { - $symbols{$w}->{xref} = $what; - }; - } - - $what = $content; - $label = $content; - $new_what = 1; - } - push @labels, [($content, $label)]; - $tag = $new_tag; - - push @{$data{$nametag}->{symbols}}, $content if ($data{$nametag}->{what}); - next; - } - - if ($tag ne "" && $new_tag) { - $tag = $new_tag; - - if ($new_what) { - @{$data{$what}->{label_list}} = @labels if ($data{$nametag}->{what}); - @labels = (); - $label = ""; - $new_what = 0; - - $data{$what}->{type} = $type; - if (!defined($data{$what}->{file})) { - $data{$what}->{file} = $name; - $data{$what}->{filepath} = $file; - } else { - $data{$what}->{description} .= "\n\n" if (defined($data{$what}->{description})); - if ($name ne $data{$what}->{file}) { - $data{$what}->{file} .= " " . $name; - $data{$what}->{filepath} .= " " . $file; - } - } - print STDERR "\twhat: $what\n" if ($debug & $dbg_what_parsing); - $data{$what}->{line_no} = $ln; - } else { - $data{$what}->{line_no} = $ln if (!defined($data{$what}->{line_no})); - } - - if (!$what) { - parse_error($file, $ln, "'What:' should come first:", $_); - next; - } - if ($new_tag eq "description") { - $sep =~ s,:, ,; - $content = ' ' x length($new_tag) . $sep . 
$content; - while ($content =~ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e) {} - if ($content =~ m/^(\s*)(\S.*)$/) { - # Preserve initial spaces for the first line - $space = $1; - $content = "$2\n"; - $data{$what}->{$tag} .= $content; - } else { - undef($space); - } - - } else { - $data{$what}->{$tag} = $content; - } - next; - } - } - - # Store any contents before tags at the database - if (!$tag && $data{$nametag}->{what}) { - $data{$nametag}->{description} .= $_; - next; - } - - if ($tag eq "description") { - my $content = $_; - while ($content =~ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e) {} - if (m/^\s*\n/) { - $data{$what}->{$tag} .= "\n"; - next; - } - - if (!defined($space)) { - # Preserve initial spaces for the first line - if ($content =~ m/^(\s*)(\S.*)$/) { - $space = $1; - $content = "$2\n"; - } - } else { - $space = "" if (!($content =~ s/^($space)//)); - } - $data{$what}->{$tag} .= $content; - - next; - } - if (m/^\s*(.*)/) { - $data{$what}->{$tag} .= "\n$1"; - $data{$what}->{$tag} =~ s/\n+$//; - next; - } - - # Everything else is error - parse_error($file, $ln, "Unexpected content", $_); - } - $data{$nametag}->{description} =~ s/^\n+// if ($data{$nametag}->{description}); - if ($what) { - parse_error($file, $ln, "What '$what' doesn't have a description", "") if (!$data{$what}->{description}); - - foreach my $w(split /\xac/,$what) { - $symbols{$w}->{xref} = $what; - }; - } - close IN; -} - -sub create_labels { - my %labels; - - foreach my $what (keys %data) { - next if ($data{$what}->{file} eq "File"); - - foreach my $p (@{$data{$what}->{label_list}}) { - my ($content, $label) = @{$p}; - $label = "abi_" . $label . " "; - $label =~ tr/A-Z/a-z/; - - # Convert special chars to "_" - $label =~s/([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff])/_/g; - $label =~ s,_+,_,g; - $label =~ s,_$,,; - - # Avoid duplicated labels - while (defined($labels{$label})) { - my @chars = ("A".."Z", "a".."z"); - $label .= $chars[rand @chars]; - } - $labels{$label} = 1; - - $data{$what}->{label} = $label; - - # only one label is enough - last; - } - } -} - -# -# Outputs the book on ReST format -# - -# \b doesn't work well with paths. So, we need to define something else: -# Boundaries are punct characters, spaces and end-of-line -my $start = qr {(^|\s|\() }x; -my $bondary = qr { ([,.:;\)\s]|\z) }x; -my $xref_match = qr { $start(\/(sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)$bondary }x; -my $symbols = qr { ([\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff]) }x; - -sub output_rest { - create_labels(); - - my $part = ""; - - foreach my $what (sort { - ($data{$a}->{type} eq "File") cmp ($data{$b}->{type} eq "File") || - $a cmp $b - } keys %data) { - my $type = $data{$what}->{type}; - - my @file = split / /, $data{$what}->{file}; - my @filepath = split / /, $data{$what}->{filepath}; - - if ($enable_lineno) { - printf ".. LINENO %s%s#%s\n\n", - $prefix, $file[0], - $data{$what}->{line_no}; - } - - my $w = $what; - - if ($type ne "File") { - my $cur_part = $what; - if ($what =~ '/') { - if ($what =~ m#^(\/?(?:[\w\-]+\/?){1,2})#) { - $cur_part = "Symbols under $1"; - $cur_part =~ s,/$,,; - } - } - - if ($cur_part ne "" && $part ne $cur_part) { - $part = $cur_part; - my $bar = $part; - $bar =~ s/./-/g; - print "$part\n$bar\n\n"; - } - - printf ".. _%s:\n\n", $data{$what}->{label}; - - my @names = split /\xac/,$w; - my $len = 0; - - foreach my $name (@names) { - $name =~ s/$symbols/\\$1/g; - $name = "**$name**"; - $len = length($name) if (length($name) > $len); - } - - print "+-" . "-" x $len . 
"-+\n"; - foreach my $name (@names) { - printf "| %s", $name . " " x ($len - length($name)) . " |\n"; - print "+-" . "-" x $len . "-+\n"; - } - - print "\n"; - } - - for (my $i = 0; $i < scalar(@filepath); $i++) { - my $path = $filepath[$i]; - my $f = $file[$i]; - - $path =~ s,.*/(.*/.*),$1,;; - $path =~ s,[/\-],_,g;; - my $fileref = "abi_file_".$path; - - if ($type eq "File") { - print ".. _$fileref:\n\n"; - } else { - print "Defined on file :ref:`$f <$fileref>`\n\n"; - } - } - - if ($type eq "File") { - my $bar = $w; - $bar =~ s/./-/g; - print "$w\n$bar\n\n"; - } - - my $desc = ""; - $desc = $data{$what}->{description} if (defined($data{$what}->{description})); - $desc =~ s/\s+$/\n/; - - if (!($desc =~ /^\s*$/)) { - if ($description_is_rst) { - # Remove title markups from the description - # Having titles inside ABI files will only work if extra - # care would be taken in order to strictly follow the same - # level order for each markup. - $desc =~ s/\n[\-\*\=\^\~]+\n/\n\n/g; - - # Enrich text by creating cross-references - - my $new_desc = ""; - my $init_indent = -1; - my $literal_indent = -1; - - open(my $fh, "+<", \$desc); - while (my $d = <$fh>) { - my $indent = $d =~ m/^(\s+)/; - my $spaces = length($indent); - $init_indent = $indent if ($init_indent < 0); - if ($literal_indent >= 0) { - if ($spaces > $literal_indent) { - $new_desc .= $d; - next; - } else { - $literal_indent = -1; - } - } else { - if ($d =~ /()::$/ && !($d =~ /^\s*\.\./)) { - $literal_indent = $spaces; - } - } - - $d =~ s,Documentation/(?!devicetree)(\S+)\.rst,:doc:`/$1`,g; - - my @matches = $d =~ m,Documentation/ABI/([\w\/\-]+),g; - foreach my $f (@matches) { - my $xref = $f; - my $path = $f; - $path =~ s,.*/(.*/.*),$1,;; - $path =~ s,[/\-],_,g;; - $xref .= " <abi_file_" . $path . ">"; - $d =~ s,\bDocumentation/ABI/$f\b,:ref:`$xref`,g; - } - - # Seek for cross reference symbols like /sys/... - @matches = $d =~ m/$xref_match/g; - - foreach my $s (@matches) { - next if (!($s =~ m,/,)); - if (defined($data{$s}) && defined($data{$s}->{label})) { - my $xref = $s; - - $xref =~ s/$symbols/\\$1/g; - $xref = ":ref:`$xref <" . $data{$s}->{label} . 
">`"; - - $d =~ s,$start$s$bondary,$1$xref$2,g; - } - } - $new_desc .= $d; - } - close $fh; - - - print "$new_desc\n\n"; - } else { - $desc =~ s/^\s+//; - - # Remove title markups from the description, as they won't work - $desc =~ s/\n[\-\*\=\^\~]+\n/\n\n/g; - - if ($desc =~ m/\:\n/ || $desc =~ m/\n[\t ]+/ || $desc =~ m/[\x00-\x08\x0b-\x1f\x7b-\xff]/) { - # put everything inside a code block - $desc =~ s/\n/\n /g; - - print "::\n\n"; - print " $desc\n\n"; - } else { - # Escape any special chars from description - $desc =~s/([\x00-\x08\x0b-\x1f\x21-\x2a\x2d\x2f\x3c-\x40\x5c\x5e-\x60\x7b-\xff])/\\$1/g; - print "$desc\n\n"; - } - } - } else { - print "DESCRIPTION MISSING for $what\n\n" if (!$data{$what}->{is_file}); - } - - if ($data{$what}->{symbols}) { - printf "Has the following ABI:\n\n"; - - foreach my $content(@{$data{$what}->{symbols}}) { - my $label = $data{$symbols{$content}->{xref}}->{label}; - - # Escape special chars from content - $content =~s/([\x00-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])/\\$1/g; - - print "- :ref:`$content <$label>`\n\n"; - } - } - - if (defined($data{$what}->{users})) { - my $users = $data{$what}->{users}; - - $users =~ s/\n/\n\t/g; - printf "Users:\n\t%s\n\n", $users if ($users ne ""); - } - - } -} - -# -# Searches for ABI symbols -# -sub search_symbols { - foreach my $what (sort keys %data) { - next if (!($what =~ m/($arg)/)); - - my $type = $data{$what}->{type}; - next if ($type eq "File"); - - my $file = $data{$what}->{filepath}; - - $what =~ s/\xac/, /g; - my $bar = $what; - $bar =~ s/./-/g; - - print "\n$what\n$bar\n\n"; - - my $kernelversion = $data{$what}->{kernelversion} if (defined($data{$what}->{kernelversion})); - my $contact = $data{$what}->{contact} if (defined($data{$what}->{contact})); - my $users = $data{$what}->{users} if (defined($data{$what}->{users})); - my $date = $data{$what}->{date} if (defined($data{$what}->{date})); - my $desc = $data{$what}->{description} if (defined($data{$what}->{description})); - - $kernelversion =~ s/^\s+// if ($kernelversion); - $contact =~ s/^\s+// if ($contact); - if ($users) { - $users =~ s/^\s+//; - $users =~ s/\n//g; - } - $date =~ s/^\s+// if ($date); - $desc =~ s/^\s+// if ($desc); - - printf "Kernel version:\t\t%s\n", $kernelversion if ($kernelversion); - printf "Date:\t\t\t%s\n", $date if ($date); - printf "Contact:\t\t%s\n", $contact if ($contact); - printf "Users:\t\t\t%s\n", $users if ($users); - print "Defined on file(s):\t$file\n\n"; - print "Description:\n\n$desc"; - } -} - -# Exclude /sys/kernel/debug and /sys/kernel/tracing from the search path -sub dont_parse_special_attributes { - if (($File::Find::dir =~ m,^/sys/kernel,)) { - return grep {!/(debug|tracing)/ } @_; - } - - if (($File::Find::dir =~ m,^/sys/fs,)) { - return grep {!/(pstore|bpf|fuse)/ } @_; - } - - return @_ -} - -my %leaf; -my %aliases; -my @files; -my %root; - -sub graph_add_file { - my $file = shift; - my $type = shift; - - my $dir = $file; - $dir =~ s,^(.*/).*,$1,; - $file =~ s,.*/,,; - - my $name; - my $file_ref = \%root; - foreach my $edge(split "/", $dir) { - $name .= "$edge/"; - if (!defined ${$file_ref}{$edge}) { - ${$file_ref}{$edge} = { }; - } - $file_ref = \%{$$file_ref{$edge}}; - ${$file_ref}{"__name"} = [ $name ]; - } - $name .= "$file"; - ${$file_ref}{$file} = { - "__name" => [ $name ] - }; - - return \%{$$file_ref{$file}}; -} - -sub graph_add_link { - my $file = shift; - my $link = shift; - - # Traverse graph to find the reference - my $file_ref = \%root; - foreach my $edge(split "/", $file) { - $file_ref = 
\%{$$file_ref{$edge}} || die "Missing node!"; - } - - # do a BFS - - my @queue; - my %seen; - my $st; - - push @queue, $file_ref; - $seen{$start}++; - - while (@queue) { - my $v = shift @queue; - my @child = keys(%{$v}); - - foreach my $c(@child) { - next if $seen{$$v{$c}}; - next if ($c eq "__name"); - - if (!defined($$v{$c}{"__name"})) { - printf STDERR "Error: Couldn't find a non-empty name on a children of $file/.*: "; - print STDERR Dumper(%{$v}); - exit; - } - - # Add new name - my $name = @{$$v{$c}{"__name"}}[0]; - if ($name =~ s#^$file/#$link/#) { - push @{$$v{$c}{"__name"}}, $name; - } - # Add child to the queue and mark as seen - push @queue, $$v{$c}; - $seen{$c}++; - } - } -} - -my $escape_symbols = qr { ([\x01-\x08\x0e-\x1f\x21-\x29\x2b-\x2d\x3a-\x40\x7b-\xfe]) }x; -sub parse_existing_sysfs { - my $file = $File::Find::name; - - my $mode = (lstat($file))[2]; - my $abs_file = abs_path($file); - - my @tmp; - push @tmp, $file; - push @tmp, $abs_file if ($abs_file ne $file); - - foreach my $f(@tmp) { - # Ignore cgroup, as this is big and has zero docs under ABI - return if ($f =~ m#^/sys/fs/cgroup/#); - - # Ignore firmware as it is documented elsewhere - # Either ACPI or under Documentation/devicetree/bindings/ - return if ($f =~ m#^/sys/firmware/#); - - # Ignore some sysfs nodes that aren't actually part of ABI - return if ($f =~ m#/sections|notes/#); - - # Would need to check at - # Documentation/admin-guide/kernel-parameters.txt, but this - # is not easily parseable. - return if ($f =~ m#/parameters/#); - } - - if (S_ISLNK($mode)) { - $aliases{$file} = $abs_file; - return; - } - - return if (S_ISDIR($mode)); - - # Trivial: file is defined exactly the same way at ABI What: - return if (defined($data{$file})); - return if (defined($data{$abs_file})); - - push @files, graph_add_file($abs_file, "file"); -} - -sub get_leave($) -{ - my $what = shift; - my $leave; - - my $l = $what; - my $stop = 1; - - $leave = $l; - $leave =~ s,/$,,; - $leave =~ s,.*/,,; - $leave =~ s/[\(\)]//g; - - # $leave is used to improve search performance at - # check_undefined_symbols, as the algorithm there can seek - # for a small number of "what". It also allows giving a - # hint about a leave with the same name somewhere else. - # However, there are a few occurences where the leave is - # either a wildcard or a number. Just group such cases - # altogether. 
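As an illustration of this grouping (hypothetical entries, not taken from the real ABI files):

    /sys/class/power_supply/<supply_name>/capacity    -> literal leave "capacity"
    /sys/bus/iio/devices/iio:deviceX/in_voltageY_raw  -> wildcard after regex conversion, grouped as "others"

Literal leaves are matched only against the few regexes registered under their own name, while wildcard and numeric leaves fall back to the larger "others" bucket.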
- if ($leave =~ m/\.\*/ || $leave eq "" || $leave =~ /\\d/) { - $leave = "others"; - } - - return $leave; -} - -my @not_found; - -sub check_file($$) -{ - my $file_ref = shift; - my $names_ref = shift; - my @names = @{$names_ref}; - my $file = $names[0]; - - my $found_string; - - my $leave = get_leave($file); - if (!defined($leaf{$leave})) { - $leave = "others"; - } - my @expr = @{$leaf{$leave}->{expr}}; - die ("\rmissing rules for $leave") if (!defined($leaf{$leave})); - - my $path = $file; - $path =~ s,(.*/).*,$1,; - - if ($search_string) { - return if (!($file =~ m#$search_string#)); - $found_string = 1; - } - - for (my $i = 0; $i < @names; $i++) { - if ($found_string && $hint) { - if (!$i) { - print STDERR "--> $names[$i]\n"; - } else { - print STDERR " $names[$i]\n"; - } - } - foreach my $re (@expr) { - print STDERR "$names[$i] =~ /^$re\$/\n" if ($debug && $dbg_undefined); - if ($names[$i] =~ $re) { - return; - } - } - } - - if ($leave ne "others") { - my @expr = @{$leaf{"others"}->{expr}}; - for (my $i = 0; $i < @names; $i++) { - foreach my $re (@expr) { - print STDERR "$names[$i] =~ /^$re\$/\n" if ($debug && $dbg_undefined); - if ($names[$i] =~ $re) { - return; - } - } - } - } - - push @not_found, $file if (!$search_string || $found_string); - - if ($hint && (!$search_string || $found_string)) { - my $what = $leaf{$leave}->{what}; - $what =~ s/\xac/\n\t/g; - if ($leave ne "others") { - print STDERR "\r more likely regexes:\n\t$what\n"; - } else { - print STDERR "\r tested regexes:\n\t$what\n"; - } - } -} - -sub check_undefined_symbols { - my $num_files = scalar @files; - my $next_i = 0; - my $start_time = times; - - @files = sort @files; - - my $last_time = $start_time; - - # When either debug or hint is enabled, there's no sense showing - # progress, as the progress will be overriden. - if ($hint || ($debug && $dbg_undefined)) { - $next_i = $num_files; - } - - my $is_console; - $is_console = 1 if (-t STDERR); - - for (my $i = 0; $i < $num_files; $i++) { - my $file_ref = $files[$i]; - my @names = @{$$file_ref{"__name"}}; - - check_file($file_ref, \@names); - - my $cur_time = times; - - if ($i == $next_i || $cur_time > $last_time + 1) { - my $percent = $i * 100 / $num_files; - - my $tm = $cur_time - $start_time; - my $time = sprintf "%d:%02d", int($tm), 60 * ($tm - int($tm)); - - printf STDERR "\33[2K\r", if ($is_console); - printf STDERR "%s: processing sysfs files... %i%%: $names[0]", $time, $percent; - printf STDERR "\n", if (!$is_console); - STDERR->flush(); - - $next_i = int (($percent + 1) * $num_files / 100); - $last_time = $cur_time; - } - } - - my $cur_time = times; - my $tm = $cur_time - $start_time; - my $time = sprintf "%d:%02d", int($tm), 60 * ($tm - int($tm)); - - printf STDERR "\33[2K\r", if ($is_console); - printf STDERR "%s: processing sysfs files... 
done\n", $time; - - foreach my $file (@not_found) { - print "$file not found.\n"; - } -} - -sub undefined_symbols { - print STDERR "Reading $sysfs_prefix directory contents..."; - find({ - wanted =>\&parse_existing_sysfs, - preprocess =>\&dont_parse_special_attributes, - no_chdir => 1 - }, $sysfs_prefix); - print STDERR "done.\n"; - - $leaf{"others"}->{what} = ""; - - print STDERR "Converting ABI What fields into regexes..."; - foreach my $w (sort keys %data) { - foreach my $what (split /\xac/,$w) { - next if (!($what =~ m/^$sysfs_prefix/)); - - # Convert what into regular expressions - - # Escape dot characters - $what =~ s/\./\xf6/g; - - # Temporarily change [0-9]+ type of patterns - $what =~ s/\[0\-9\]\+/\xff/g; - - # Temporarily change [\d+-\d+] type of patterns - $what =~ s/\[0\-\d+\]/\xff/g; - $what =~ s/\[(\d+)\]/\xf4$1\xf5/g; - - # Temporarily change [0-9] type of patterns - $what =~ s/\[(\d)\-(\d)\]/\xf4$1-$2\xf5/g; - - # Handle multiple option patterns - $what =~ s/[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]/($1|$2)/g; - - # Handle wildcards - $what =~ s,\*,.*,g; - $what =~ s,/\xf6..,/.*,g; - $what =~ s/\<[^\>]+\>/.*/g; - $what =~ s/\{[^\}]+\}/.*/g; - $what =~ s/\[[^\]]+\]/.*/g; - - $what =~ s/[XYZ]/.*/g; - - # Recover [0-9] type of patterns - $what =~ s/\xf4/[/g; - $what =~ s/\xf5/]/g; - - # Remove duplicated spaces - $what =~ s/\s+/ /g; - - # Special case: this ABI has a parenthesis on it - $what =~ s/sqrt\(x^2\+y^2\+z^2\)/sqrt\(x^2\+y^2\+z^2\)/; - - # Special case: drop comparition as in: - # What: foo = <something> - # (this happens on a few IIO definitions) - $what =~ s,\s*\=.*$,,; - - # Escape all other symbols - $what =~ s/$escape_symbols/\\$1/g; - $what =~ s/\\\\/\\/g; - $what =~ s/\\([\[\]\(\)\|])/$1/g; - $what =~ s/(\d+)\\(-\d+)/$1$2/g; - - $what =~ s/\xff/\\d+/g; - - # Special case: IIO ABI which a parenthesis. - $what =~ s/sqrt(.*)/sqrt\(.*\)/; - - # Simplify regexes with multiple .* - $what =~ s#(?:\.\*){2,}##g; -# $what =~ s#\.\*/\.\*#.*#g; - - # Recover dot characters - $what =~ s/\xf6/\./g; - - my $leave = get_leave($what); - - my $added = 0; - foreach my $l (split /\|/, $leave) { - if (defined($leaf{$l})) { - next if ($leaf{$l}->{what} =~ m/\b$what\b/); - $leaf{$l}->{what} .= "\xac" . 
$what; - $added = 1; - } else { - $leaf{$l}->{what} = $what; - $added = 1; - } - } - if ($search_string && $added) { - print STDERR "What: $what\n" if ($what =~ m#$search_string#); - } - - } - } - # Compile regexes - foreach my $l (sort keys %leaf) { - my @expr; - foreach my $w(sort split /\xac/, $leaf{$l}->{what}) { - push @expr, qr /^$w$/; - } - $leaf{$l}->{expr} = \@expr; - } - - # Take links into account - foreach my $link (sort keys %aliases) { - my $abs_file = $aliases{$link}; - graph_add_link($abs_file, $link); - } - print STDERR "done.\n"; - - check_undefined_symbols; -} - -# Ensure that the prefix will always end with a slash -# While this is not needed for find, it makes the patch nicer -# with --enable-lineno -$prefix =~ s,/?$,/,; - -if ($cmd eq "undefined" || $cmd eq "search") { - $show_warnings = 0; -} -# -# Parses all ABI files located at $prefix dir -# -find({wanted =>\&parse_abi, no_chdir => 1}, $prefix); - -print STDERR Data::Dumper->Dump([\%data], [qw(*data)]) if ($debug & $dbg_dump_abi_structs); - -# -# Handles the command -# -if ($cmd eq "undefined") { - undefined_symbols; -} elsif ($cmd eq "search") { - search_symbols; -} else { - if ($cmd eq "rest") { - output_rest; - } - - # Warn about duplicated ABI entries - foreach my $what(sort keys %symbols) { - my @files = @{$symbols{$what}->{file}}; - - next if (scalar(@files) == 1); - - printf STDERR "Warning: $what is defined %d times: @files\n", - scalar(@files); - } -} - -__END__ - -=head1 NAME - -get_abi.pl - parse the Linux ABI files and produce a ReST book. - -=head1 SYNOPSIS - -B<get_abi.pl> [--debug <level>] [--enable-lineno] [--man] [--help] - [--(no-)rst-source] [--dir=<dir>] [--show-hints] - [--search-string <regex>] - <COMMAND> [<ARGUMENT>] - -Where B<COMMAND> can be: - -=over 8 - -B<search> I<SEARCH_REGEX> - search for I<SEARCH_REGEX> inside ABI - -B<rest> - output the ABI in ReST markup language - -B<validate> - validate the ABI contents - -B<undefined> - existing symbols at the system that aren't - defined at Documentation/ABI - -=back - -=head1 OPTIONS - -=over 8 - -=item B<--dir> - -Changes the location of the ABI search. By default, it uses -the Documentation/ABI directory. - -=item B<--rst-source> and B<--no-rst-source> - -The input file may be using ReST syntax or not. Those two options allow -selecting between a rst-compliant source ABI (B<--rst-source>), or a -plain text that may be violating ReST spec, so it requres some escaping -logic (B<--no-rst-source>). - -=item B<--enable-lineno> - -Enable output of .. LINENO lines. - -=item B<--debug> I<debug level> - -Print debug information according with the level, which is given by the -following bitmask: - - - 1: Debug parsing What entries from ABI files; - - 2: Shows what files are opened from ABI files; - - 4: Dump the structs used to store the contents of the ABI files. - -=item B<--show-hints> - -Show hints about possible definitions for the missing ABI symbols. -Used only when B<undefined>. - -=item B<--search-string> I<regex string> - -Show only occurences that match a search string. -Used only when B<undefined>. - -=item B<--help> - -Prints a brief help message and exits. - -=item B<--man> - -Prints the manual page and exits. - -=back - -=head1 DESCRIPTION - -Parse the Linux ABI files from ABI DIR (usually located at Documentation/ABI), -allowing to search for ABI symbols or to produce a ReST book containing -the Linux ABI documentation. 
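For reference, the entries consumed by this script (and by its Python replacement added below) follow a simple tag-per-line layout; a hypothetical example, with an invented path and values:

    What:           /sys/class/foo/fooN/bar
    Date:           January 2025
    KernelVersion:  6.14
    Contact:        linux-kernel@vger.kernel.org
    Description:
                    Reports the current bar value of device fooN.
    Users:          foo-utils

The recognized tags are What, Date, KernelVersion, Contact, Users and Description; continuation lines are indented under the tag they belong to.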
- -=head1 EXAMPLES - -Search for all stable symbols with the word "usb": - -=over 8 - -$ scripts/get_abi.pl search usb --dir Documentation/ABI/stable - -=back - -Search for all symbols that match the regex expression "usb.*cap": - -=over 8 - -$ scripts/get_abi.pl search usb.*cap - -=back - -Output all obsoleted symbols in ReST format - -=over 8 - -$ scripts/get_abi.pl rest --dir Documentation/ABI/obsolete - -=back - -=head1 BUGS - -Report bugs to Mauro Carvalho Chehab <mchehab+huawei@kernel.org> - -=head1 COPYRIGHT - -Copyright (c) 2016-2021 by Mauro Carvalho Chehab <mchehab+huawei@kernel.org>. - -License GPLv2: GNU GPL version 2 <http://gnu.org/licenses/gpl.html>. - -This is free software: you are free to change and redistribute it. -There is NO WARRANTY, to the extent permitted by law. - -=cut diff --git a/scripts/get_abi.py b/scripts/get_abi.py new file mode 100755 index 000000000000..7ce4748a46d2 --- /dev/null +++ b/scripts/get_abi.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +# pylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +import argparse +import logging +import os +import sys + +# Import Python modules + +LIB_DIR = "lib/abi" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from abi_parser import AbiParser # pylint: disable=C0413 +from abi_regex import AbiRegex # pylint: disable=C0413 +from helpers import ABI_DIR, DEBUG_HELP # pylint: disable=C0413 +from system_symbols import SystemSymbols # pylint: disable=C0413 + +# Command line classes + + +REST_DESC = """ +Produce output in ReST format. + +The output is done on two sections: + +- Symbols: show all parsed symbols in alphabetic order; +- Files: cross reference the content of each file with the symbols on it. +""" + +class AbiRest: + """Initialize an argparse subparser for rest output""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("rest", + formatter_class=argparse.RawTextHelpFormatter, + description=REST_DESC) + + parser.add_argument("--enable-lineno", action="store_true", + help="enable lineno") + parser.add_argument("--raw", action="store_true", + help="output text as contained in the ABI files. " + "It not used, output will contain dynamically" + " generated cross references when possible.") + parser.add_argument("--no-file", action="store_true", + help="Don't the files section") + parser.add_argument("--show-hints", help="Show-hints") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + + for t in parser.doc(args.raw, not args.no_file): + if args.enable_lineno: + print (f".. 
LINENO {t[1]}#{t[2]}\n\n") + + print(t[0]) + +class AbiValidate: + """Initialize an argparse subparser for ABI validation""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("validate", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="list events") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.check_issues() + + +class AbiSearch: + """Initialize an argparse subparser for ABI search""" + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("search", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Search ABI using a regular expression") + + parser.add_argument("expression", + help="Case-insensitive search pattern for the ABI symbol") + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + parser = AbiParser(args.dir, debug=args.debug) + parser.parse_abi() + parser.search_symbols(args.expression) + +UNDEFINED_DESC=""" +Check undefined ABIs on local machine. + +Read sysfs devnodes and check if the devnodes there are defined inside +ABI documentation. + +The search logic tries to minimize the number of regular expressions to +search per each symbol. + +By default, it runs on a single CPU, as Python support for CPU threads +is still experimental, and multi-process runs on Python is very slow. + +On experimental tests, if the number of ABI symbols to search per devnode +is contained on a limit of ~150 regular expressions, using a single CPU +is a lot faster than using multiple processes. However, if the number of +regular expressions to check is at the order of ~30000, using multiple +CPUs speeds up the check. +""" + +class AbiUndefined: + """ + Initialize an argparse subparser for logic to check undefined ABI at + the current machine's sysfs + """ + + def __init__(self, subparsers): + """Initialize argparse subparsers""" + + parser = subparsers.add_parser("undefined", + formatter_class=argparse.RawTextHelpFormatter, + description=UNDEFINED_DESC) + + parser.add_argument("-S", "--sysfs-dir", default="/sys", + help="directory where sysfs is mounted") + parser.add_argument("-s", "--search-string", + help="search string regular expression to limit symbol search") + parser.add_argument("-H", "--show-hints", action="store_true", + help="Hints about definitions for missing ABI symbols.") + parser.add_argument("-j", "--jobs", "--max-workers", type=int, default=1, + help="If bigger than one, enables multiprocessing.") + parser.add_argument("-c", "--max-chunk-size", type=int, default=50, + help="Maximum number of chunk size") + parser.add_argument("-f", "--found", action="store_true", + help="Also show found items. " + "Helpful to debug the parser."), + parser.add_argument("-d", "--dry-run", action="store_true", + help="Don't actually search for undefined. 
" + "Helpful to debug the parser."), + + parser.set_defaults(func=self.run) + + def run(self, args): + """Run subparser""" + + abi = AbiRegex(args.dir, debug=args.debug, + search_string=args.search_string) + + abi_symbols = SystemSymbols(abi=abi, hints=args.show_hints, + sysfs=args.sysfs_dir) + + abi_symbols.check_undefined_symbols(dry_run=args.dry_run, + found=args.found, + max_workers=args.jobs, + chunk_size=args.max_chunk_size) + + +def main(): + """Main program""" + + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument("-d", "--debug", type=int, default=0, help="debug level") + parser.add_argument("-D", "--dir", default=ABI_DIR, help=DEBUG_HELP) + + subparsers = parser.add_subparsers() + + AbiRest(subparsers) + AbiValidate(subparsers) + AbiSearch(subparsers) + AbiUndefined(subparsers) + + args = parser.parse_args() + + if args.debug: + level = logging.DEBUG + else: + level = logging.INFO + + logging.basicConfig(level=level, format="[%(levelname)s] %(message)s") + + if "func" in args: + args.func(args) + else: + sys.exit(f"Please specify a valid command for {sys.argv[0]}") + + +# Call main method +if __name__ == "__main__": + main() diff --git a/scripts/get_feat.pl b/scripts/get_feat.pl index 5c5397eeb237..40fb28c8424e 100755 --- a/scripts/get_feat.pl +++ b/scripts/get_feat.pl @@ -512,13 +512,13 @@ print STDERR Data::Dumper->Dump([\%data], [qw(*data)]) if ($debug); # Handles the command # if ($cmd eq "current") { - $arch = qx(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/'); + $arch = qx(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/' | sed 's/s390x/s390/'); $arch =~s/\s+$//; } if ($cmd eq "ls" or $cmd eq "list") { if (!$arch) { - $arch = qx(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/'); + $arch = qx(uname -m | sed 's/x86_64/x86/' | sed 's/i386/x86/' | sed 's/s390x/s390/'); $arch =~s/\s+$//; } diff --git a/scripts/integer-wrap-ignore.scl b/scripts/integer-wrap-ignore.scl new file mode 100644 index 000000000000..431c3053a4a2 --- /dev/null +++ b/scripts/integer-wrap-ignore.scl @@ -0,0 +1,3 @@ +[{unsigned-integer-overflow,signed-integer-overflow,implicit-signed-integer-truncation,implicit-unsigned-integer-truncation}] +type:* +type:size_t=sanitize diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 03852da3d249..4b0234e4b12f 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -5,7 +5,7 @@ * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * - * Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S + * Usage: kallsyms [--all-symbols] in.map > out.S * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). 
For instance, it might @@ -37,7 +37,6 @@ struct sym_entry { unsigned long long addr; unsigned int len; unsigned int seq; - bool percpu_absolute; unsigned char sym[]; }; @@ -55,14 +54,9 @@ static struct addr_range text_ranges[] = { #define text_range_text (&text_ranges[0]) #define text_range_inittext (&text_ranges[1]) -static struct addr_range percpu_range = { - "__per_cpu_start", "__per_cpu_end", -1ULL, 0 -}; - static struct sym_entry **table; static unsigned int table_size, table_cnt; static int all_symbols; -static int absolute_percpu; static int token_profit[0x10000]; @@ -73,7 +67,7 @@ static unsigned char best_table_len[256]; static void usage(void) { - fprintf(stderr, "Usage: kallsyms [--all-symbols] [--absolute-percpu] in.map > out.S\n"); + fprintf(stderr, "Usage: kallsyms [--all-symbols] in.map > out.S\n"); exit(1); } @@ -164,7 +158,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) return NULL; check_symbol_range(name, addr, text_ranges, ARRAY_SIZE(text_ranges)); - check_symbol_range(name, addr, &percpu_range, 1); /* include the type field in the symbol name, so that it gets * compressed together */ @@ -175,7 +168,6 @@ static struct sym_entry *read_symbol(FILE *in, char **buf, size_t *buf_len) sym->len = len; sym->sym[0] = type; strcpy(sym_name(sym), name); - sym->percpu_absolute = false; return sym; } @@ -319,11 +311,6 @@ static int expand_symbol(const unsigned char *data, int len, char *result) return total; } -static bool symbol_absolute(const struct sym_entry *s) -{ - return s->percpu_absolute; -} - static int compare_names(const void *a, const void *b) { int ret; @@ -455,22 +442,11 @@ static void write_src(void) */ long long offset; - bool overflow; - - if (!absolute_percpu) { - offset = table[i]->addr - relative_base; - overflow = offset < 0 || offset > UINT_MAX; - } else if (symbol_absolute(table[i])) { - offset = table[i]->addr; - overflow = offset < 0 || offset > INT_MAX; - } else { - offset = relative_base - table[i]->addr - 1; - overflow = offset < INT_MIN || offset >= 0; - } - if (overflow) { + + offset = table[i]->addr - relative_base; + if (offset < 0 || offset > UINT_MAX) { fprintf(stderr, "kallsyms failure: " - "%s symbol value %#llx out of range in relative mode\n", - symbol_absolute(table[i]) ? "absolute" : "relative", + "relative symbol value %#llx out of range\n", table[i]->addr); exit(EXIT_FAILURE); } @@ -725,36 +701,15 @@ static void sort_symbols(void) qsort(table, table_cnt, sizeof(table[0]), compare_symbols); } -static void make_percpus_absolute(void) -{ - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (symbol_in_range(table[i], &percpu_range, 1)) { - /* - * Keep the 'A' override for percpu symbols to - * ensure consistent behavior compared to older - * versions of this tool. - */ - table[i]->sym[0] = 'A'; - table[i]->percpu_absolute = true; - } -} - /* find the minimum non-absolute symbol address */ static void record_relative_base(void) { - unsigned int i; - - for (i = 0; i < table_cnt; i++) - if (!symbol_absolute(table[i])) { - /* - * The table is sorted by address. - * Take the first non-absolute symbol value. - */ - relative_base = table[i]->addr; - return; - } + /* + * The table is sorted by address. + * Take the first symbol value. 
+ */ + if (table_cnt) + relative_base = table[0]->addr; } int main(int argc, char **argv) @@ -762,7 +717,6 @@ int main(int argc, char **argv) while (1) { static const struct option long_options[] = { {"all-symbols", no_argument, &all_symbols, 1}, - {"absolute-percpu", no_argument, &absolute_percpu, 1}, {}, }; @@ -779,8 +733,6 @@ int main(int argc, char **argv) read_map(argv[optind]); shrink_table(); - if (absolute_percpu) - make_percpus_absolute(); sort_symbols(); record_relative_base(); optimize_token_table(); diff --git a/scripts/kernel-doc b/scripts/kernel-doc index e57c5e989a0a..af6cf408b96d 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -26,7 +26,7 @@ kernel-doc - Print formatted kernel documentation to stdout kernel-doc [-h] [-v] [-Werror] [-Wall] [-Wreturn] [-Wshort-desc[ription]] [-Wcontents-before-sections] [ -man | - -rst [-sphinx-version VERSION] [-enable-lineno] | + -rst [-enable-lineno] | -none ] [ @@ -130,7 +130,6 @@ if ($#ARGV == -1) { } my $kernelversion; -my ($sphinx_major, $sphinx_minor, $sphinx_patch); my $dohighlight = ""; @@ -138,7 +137,6 @@ my $verbose = 0; my $Werror = 0; my $Wreturn = 0; my $Wshort_desc = 0; -my $Wcontents_before_sections = 0; my $output_mode = "rst"; my $output_preformatted = 0; my $no_doc_sections = 0; @@ -179,7 +177,7 @@ my ($function, %function_table, %parametertypes, $declaration_purpose); my %nosymbol_table = (); my $declaration_start_line; my ($type, $declaration_name, $return_type); -my ($newsection, $newcontents, $prototype, $brcount, %source_map); +my ($newsection, $newcontents, $prototype, $brcount); if (defined($ENV{'KBUILD_VERBOSE'}) && $ENV{'KBUILD_VERBOSE'} =~ '1') { $verbose = 1; @@ -224,7 +222,6 @@ use constant { STATE_INLINE => 7, # gathering doc outside main block }; my $state; -my $in_doc_sect; my $leading_space; # Inline documentation state @@ -333,12 +330,9 @@ while ($ARGV[0] =~ m/^--?(.*)/) { $Wreturn = 1; } elsif ($cmd eq "Wshort-desc" or $cmd eq "Wshort-description") { $Wshort_desc = 1; - } elsif ($cmd eq "Wcontents-before-sections") { - $Wcontents_before_sections = 1; } elsif ($cmd eq "Wall") { $Wreturn = 1; $Wshort_desc = 1; - $Wcontents_before_sections = 1; } elsif (($cmd eq "h") || ($cmd eq "help")) { pod2usage(-exitval => 0, -verbose => 2); } elsif ($cmd eq 'no-doc-sections') { @@ -347,23 +341,6 @@ while ($ARGV[0] =~ m/^--?(.*)/) { $enable_lineno = 1; } elsif ($cmd eq 'show-not-found') { $show_not_found = 1; # A no-op but don't fail - } elsif ($cmd eq "sphinx-version") { - my $ver_string = shift @ARGV; - if ($ver_string =~ m/^(\d+)(\.\d+)?(\.\d+)?/) { - $sphinx_major = $1; - if (defined($2)) { - $sphinx_minor = substr($2,1); - } else { - $sphinx_minor = 0; - } - if (defined($3)) { - $sphinx_patch = substr($3,1) - } else { - $sphinx_patch = 0; - } - } else { - die "Sphinx version should either major.minor or major.minor.patch format\n"; - } } else { # Unknown argument pod2usage( @@ -387,8 +364,6 @@ while ($ARGV[0] =~ m/^--?(.*)/) { # continue execution near EOF; -# The C domain dialect changed on Sphinx 3. So, we need to check the -# version in order to produce the right tags. sub findprog($) { foreach(split(/:/, $ENV{PATH})) { @@ -396,42 +371,6 @@ sub findprog($) } } -sub get_sphinx_version() -{ - my $ver; - - my $cmd = "sphinx-build"; - if (!findprog($cmd)) { - my $cmd = "sphinx-build3"; - if (!findprog($cmd)) { - $sphinx_major = 1; - $sphinx_minor = 2; - $sphinx_patch = 0; - printf STDERR "Warning: Sphinx version not found. 
Using default (Sphinx version %d.%d.%d)\n", - $sphinx_major, $sphinx_minor, $sphinx_patch; - return; - } - } - - open IN, "$cmd --version 2>&1 |"; - while (<IN>) { - if (m/^\s*sphinx-build\s+([\d]+)\.([\d\.]+)(\+\/[\da-f]+)?$/) { - $sphinx_major = $1; - $sphinx_minor = $2; - $sphinx_patch = $3; - last; - } - # Sphinx 1.2.x uses a different format - if (m/^\s*Sphinx.*\s+([\d]+)\.([\d\.]+)$/) { - $sphinx_major = $1; - $sphinx_minor = $2; - $sphinx_patch = $3; - last; - } - } - close IN; -} - # get kernel version from env sub get_kernel_version() { my $version = 'unknown kernel version'; @@ -816,6 +755,10 @@ sub output_highlight_rst { if ($block) { $output .= highlight_block($block); } + + $output =~ s/^\n+//g; + $output =~ s/\n+$//g; + foreach $line (split "\n", $output) { print $lineprefix . $line . "\n"; } @@ -859,9 +802,10 @@ sub output_function_rst(%) { $signature .= ")"; } - if ($sphinx_major < 3) { + if ($args{'typedef'} || $args{'functiontype'} eq "") { + print ".. c:macro:: ". $args{'function'} . "\n\n"; + if ($args{'typedef'}) { - print ".. c:type:: ". $args{'function'} . "\n\n"; print_lineno($declaration_start_line); print " **Typedef**: "; $lineprefix = ""; @@ -869,25 +813,10 @@ sub output_function_rst(%) { print "\n\n**Syntax**\n\n"; print " ``$signature``\n\n"; } else { - print ".. c:function:: $signature\n\n"; + print "``$signature``\n\n"; } } else { - if ($args{'typedef'} || $args{'functiontype'} eq "") { - print ".. c:macro:: ". $args{'function'} . "\n\n"; - - if ($args{'typedef'}) { - print_lineno($declaration_start_line); - print " **Typedef**: "; - $lineprefix = ""; - output_highlight_rst($args{'purpose'}); - print "\n\n**Syntax**\n\n"; - print " ``$signature``\n\n"; - } else { - print "``$signature``\n\n"; - } - } else { - print ".. c:function:: $signature\n\n"; - } + print ".. c:function:: $signature\n\n"; } if (!$args{'typedef'}) { @@ -955,13 +884,9 @@ sub output_enum_rst(%) { my $count; my $outer; - if ($sphinx_major < 3) { - my $name = "enum " . $args{'enum'}; - print "\n\n.. c:type:: " . $name . "\n\n"; - } else { - my $name = $args{'enum'}; - print "\n\n.. c:enum:: " . $name . "\n\n"; - } + my $name = $args{'enum'}; + print "\n\n.. c:enum:: " . $name . "\n\n"; + print_lineno($declaration_start_line); $lineprefix = " "; output_highlight_rst($args{'purpose'}); @@ -992,11 +917,8 @@ sub output_typedef_rst(%) { my $oldprefix = $lineprefix; my $name; - if ($sphinx_major < 3) { - $name = "typedef " . $args{'typedef'}; - } else { - $name = $args{'typedef'}; - } + $name = $args{'typedef'}; + print "\n\n.. c:type:: " . $name . "\n\n"; print_lineno($declaration_start_line); $lineprefix = " "; @@ -1012,17 +934,13 @@ sub output_struct_rst(%) { my ($parameter); my $oldprefix = $lineprefix; - if ($sphinx_major < 3) { - my $name = $args{'type'} . " " . $args{'struct'}; - print "\n\n.. c:type:: " . $name . "\n\n"; + my $name = $args{'struct'}; + if ($args{'type'} eq 'union') { + print "\n\n.. c:union:: " . $name . "\n\n"; } else { - my $name = $args{'struct'}; - if ($args{'type'} eq 'union') { - print "\n\n.. c:union:: " . $name . "\n\n"; - } else { - print "\n\n.. c:struct:: " . $name . "\n\n"; - } + print "\n\n.. c:struct:: " . $name . 
"\n\n"; } + print_lineno($declaration_start_line); $lineprefix = " "; output_highlight_rst($args{'purpose'}); @@ -2005,10 +1923,6 @@ sub map_filename($) { $file = $orig_file; } - if (defined($source_map{$file})) { - $file = $source_map{$file}; - } - return $file; } @@ -2044,7 +1958,6 @@ sub process_export_file($) { sub process_normal() { if (/$doc_start/o) { $state = STATE_NAME; # next line is always the function name - $in_doc_sect = 0; $declaration_start_line = $. + 1; } } @@ -2149,7 +2062,6 @@ sub process_body($$) { } if (/$doc_sect/i) { # case insensitive for supported section names - $in_doc_sect = 1; $newsection = $1; $newcontents = $2; @@ -2166,14 +2078,10 @@ sub process_body($$) { } if (($contents ne "") && ($contents ne "\n")) { - if (!$in_doc_sect && $Wcontents_before_sections) { - emit_warning("${file}:$.", "contents before sections\n"); - } dump_section($file, $section, $contents); $section = $section_default; } - $in_doc_sect = 1; $state = STATE_BODY; $contents = $newcontents; $new_start_line = $.; @@ -2387,11 +2295,6 @@ sub process_file($) { close IN_FILE; } - -if ($output_mode eq "rst") { - get_sphinx_version() if (!$sphinx_major); -} - $kernelversion = get_kernel_version(); # generate a sequence of code that will splice in highlighting information @@ -2403,19 +2306,6 @@ for (my $k = 0; $k < @highlights; $k++) { $dohighlight .= "\$contents =~ s:$pattern:$result:gs;\n"; } -# Read the file that maps relative names to absolute names for -# separate source and object directories and for shadow trees. -if (open(SOURCE_MAP, "<.tmp_filelist.txt")) { - my ($relname, $absname); - while(<SOURCE_MAP>) { - chop(); - ($relname, $absname) = (split())[0..1]; - $relname =~ s:^/+::; - $source_map{$relname} = $absname; - } - close(SOURCE_MAP); -} - if ($output_selection == OUTPUT_EXPORTED || $output_selection == OUTPUT_INTERNAL) { @@ -2471,17 +2361,6 @@ Do not output documentation, only warnings. =head3 reStructuredText only -=over 8 - -=item -sphinx-version VERSION - -Use the ReST C domain dialect compatible with a specific Sphinx Version. - -If not specified, kernel-doc will auto-detect using the sphinx-build version -found on PATH. - -=back - =head2 Output selection (mutually exclusive): =over 8 diff --git a/scripts/lib/abi/abi_parser.py b/scripts/lib/abi/abi_parser.py new file mode 100644 index 000000000000..66a738013ce1 --- /dev/null +++ b/scripts/lib/abi/abi_parser.py @@ -0,0 +1,628 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0903,R0911,R0912,R0913,R0914,R0915,R0917,C0302 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. 
+""" + +from argparse import Namespace +import logging +import os +import re + +from pprint import pformat +from random import randrange, seed + +# Import Python modules + +from helpers import AbiDebug, ABI_DIR + + +class AbiParser: + """Main class to parse ABI files""" + + TAGS = r"(what|where|date|kernelversion|contact|description|users)" + XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)" + + def __init__(self, directory, logger=None, + enable_lineno=False, show_warnings=True, debug=0): + """Stores arguments for the class and initialize class vars""" + + self.directory = directory + self.enable_lineno = enable_lineno + self.show_warnings = show_warnings + self.debug = debug + + if not logger: + self.log = logging.getLogger("get_abi") + else: + self.log = logger + + self.data = {} + self.what_symbols = {} + self.file_refs = {} + self.what_refs = {} + + # Ignore files that contain such suffixes + self.ignore_suffixes = (".rej", ".org", ".orig", ".bak", "~") + + # Regular expressions used on parser + self.re_abi_dir = re.compile(r"(.*)" + ABI_DIR) + self.re_tag = re.compile(r"(\S+)(:\s*)(.*)", re.I) + self.re_valid = re.compile(self.TAGS) + self.re_start_spc = re.compile(r"(\s*)(\S.*)") + self.re_whitespace = re.compile(r"^\s+") + + # Regular used on print + self.re_what = re.compile(r"(\/?(?:[\w\-]+\/?){1,2})") + self.re_escape = re.compile(r"([\.\x01-\x08\x0e-\x1f\x21-\x2f\x3a-\x40\x7b-\xff])") + self.re_unprintable = re.compile(r"([\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\xff]+)") + self.re_title_mark = re.compile(r"\n[\-\*\=\^\~]+\n") + self.re_doc = re.compile(r"Documentation/(?!devicetree)(\S+)\.rst") + self.re_abi = re.compile(r"(Documentation/ABI/)([\w\/\-]+)") + self.re_xref_node = re.compile(self.XREF) + + def warn(self, fdata, msg, extra=None): + """Displays a parse error if warning is enabled""" + + if not self.show_warnings: + return + + msg = f"{fdata.fname}:{fdata.ln}: {msg}" + if extra: + msg += "\n\t\t" + extra + + self.log.warning(msg) + + def add_symbol(self, what, fname, ln=None, xref=None): + """Create a reference table describing where each 'what' is located""" + + if what not in self.what_symbols: + self.what_symbols[what] = {"file": {}} + + if fname not in self.what_symbols[what]["file"]: + self.what_symbols[what]["file"][fname] = [] + + if ln and ln not in self.what_symbols[what]["file"][fname]: + self.what_symbols[what]["file"][fname].append(ln) + + if xref: + self.what_symbols[what]["xref"] = xref + + def _parse_line(self, fdata, line): + """Parse a single line of an ABI file""" + + new_what = False + new_tag = False + content = None + + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + sep = match.group(2) + content = match.group(3) + + match = self.re_valid.search(new) + if match: + new_tag = match.group(1) + else: + if fdata.tag == "description": + # New "tag" is actually part of description. + # Don't consider it a tag + new_tag = False + elif fdata.tag != "": + self.warn(fdata, f"tag '{fdata.tag}' is invalid", line) + + if new_tag: + # "where" is Invalid, but was a common mistake. Warn if found + if new_tag == "where": + self.warn(fdata, "tag 'Where' is invalid. 
Should be 'What:' instead") + new_tag = "what" + + if new_tag == "what": + fdata.space = None + + if content not in self.what_symbols: + self.add_symbol(what=content, fname=fdata.fname, ln=fdata.ln) + + if fdata.tag == "what": + fdata.what.append(content.strip("\n")) + else: + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fdata.fname, + ln=fdata.what_ln, xref=fdata.key) + + fdata.label = content + new_what = True + + key = "abi_" + content.lower() + fdata.key = self.re_unprintable.sub("_", key).strip("_") + + # Avoid duplicated keys but using a defined seed, to make + # the namespace identical if there aren't changes at the + # ABI symbols + seed(42) + + while fdata.key in self.data: + char = randrange(0, 51) + ord("A") + if char > ord("Z"): + char += ord("a") - ord("Z") - 1 + + fdata.key += chr(char) + + if fdata.key and fdata.key not in self.data: + self.data[fdata.key] = { + "what": [content], + "file": [fdata.file_ref], + "path": fdata.ftype, + "line_no": fdata.ln, + } + + fdata.what = self.data[fdata.key]["what"] + + self.what_refs[content] = fdata.key + fdata.tag = new_tag + fdata.what_ln = fdata.ln + + if fdata.nametag["what"]: + t = (content, fdata.key) + if t not in fdata.nametag["symbols"]: + fdata.nametag["symbols"].append(t) + + return + + if fdata.tag and new_tag: + fdata.tag = new_tag + + if new_what: + fdata.label = "" + + if "description" in self.data[fdata.key]: + self.data[fdata.key]["description"] += "\n\n" + + if fdata.file_ref not in self.data[fdata.key]["file"]: + self.data[fdata.key]["file"].append(fdata.file_ref) + + if self.debug == AbiDebug.WHAT_PARSING: + self.log.debug("what: %s", fdata.what) + + if not fdata.what: + self.warn(fdata, "'What:' should come first:", line) + return + + if new_tag == "description": + fdata.space = None + + if content: + sep = sep.replace(":", " ") + + c = " " * len(new_tag) + sep + content + c = c.expandtabs() + + match = self.re_start_spc.match(c) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + content = match.group(2) + "\n" + + self.data[fdata.key][fdata.tag] = content + + return + + # Store any contents before tags at the database + if not fdata.tag and "what" in fdata.nametag: + fdata.nametag["description"] += line + return + + if fdata.tag == "description": + content = line.expandtabs() + + if self.re_whitespace.sub("", content) == "": + self.data[fdata.key][fdata.tag] += "\n" + return + + if fdata.space is None: + match = self.re_start_spc.match(content) + if match: + # Preserve initial spaces for the first line + fdata.space = match.group(1) + + content = match.group(2) + "\n" + else: + if content.startswith(fdata.space): + content = content[len(fdata.space):] + + else: + fdata.space = "" + + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += content + return + + content = line.strip() + if fdata.tag: + if fdata.tag == "what": + w = content.strip("\n") + if w: + self.data[fdata.key][fdata.tag].append(w) + else: + self.data[fdata.key][fdata.tag] += "\n" + content.rstrip("\n") + return + + # Everything else is error + if content: + self.warn(fdata, "Unexpected content", line) + + def parse_readme(self, nametag, fname): + """Parse ABI README file""" + + nametag["what"] = ["Introduction"] + nametag["path"] = "README" + with open(fname, "r", 
encoding="utf8", errors="backslashreplace") as fp: + for line in fp: + match = self.re_tag.match(line) + if match: + new = match.group(1).lower() + + match = self.re_valid.search(new) + if match: + nametag["description"] += "\n:" + line + continue + + nametag["description"] += line + + def parse_file(self, fname, path, basename): + """Parse a single file""" + + ref = f"abi_file_{path}_{basename}" + ref = self.re_unprintable.sub("_", ref).strip("_") + + # Store per-file state into a namespace variable. This will be used + # by the per-line parser state machine and by the warning function. + fdata = Namespace + + fdata.fname = fname + fdata.name = basename + + pos = fname.find(ABI_DIR) + if pos > 0: + f = fname[pos:] + else: + f = fname + + fdata.file_ref = (f, ref) + self.file_refs[f] = ref + + fdata.ln = 0 + fdata.what_ln = 0 + fdata.tag = "" + fdata.label = "" + fdata.what = [] + fdata.key = None + fdata.xrefs = None + fdata.space = None + fdata.ftype = path.split("/")[0] + + fdata.nametag = {} + fdata.nametag["what"] = [f"ABI file {path}/{basename}"] + fdata.nametag["type"] = "File" + fdata.nametag["path"] = fdata.ftype + fdata.nametag["file"] = [fdata.file_ref] + fdata.nametag["line_no"] = 1 + fdata.nametag["description"] = "" + fdata.nametag["symbols"] = [] + + self.data[ref] = fdata.nametag + + if self.debug & AbiDebug.WHAT_OPEN: + self.log.debug("Opening file %s", fname) + + if basename == "README": + self.parse_readme(fdata.nametag, fname) + return + + with open(fname, "r", encoding="utf8", errors="backslashreplace") as fp: + for line in fp: + fdata.ln += 1 + + self._parse_line(fdata, line) + + if "description" in fdata.nametag: + fdata.nametag["description"] = fdata.nametag["description"].lstrip("\n") + + if fdata.key: + if "description" not in self.data.get(fdata.key, {}): + self.warn(fdata, f"{fdata.key} doesn't have a description") + + for w in fdata.what: + self.add_symbol(what=w, fname=fname, xref=fdata.key) + + def _parse_abi(self, root=None): + """Internal function to parse documentation ABI recursively""" + + if not root: + root = self.directory + + with os.scandir(root) as obj: + for entry in obj: + name = os.path.join(root, entry.name) + + if entry.is_dir(): + self._parse_abi(name) + continue + + if not entry.is_file(): + continue + + basename = os.path.basename(name) + + if basename.startswith("."): + continue + + if basename.endswith(self.ignore_suffixes): + continue + + path = self.re_abi_dir.sub("", os.path.dirname(name)) + + self.parse_file(name, path, basename) + + def parse_abi(self, root=None): + """Parse documentation ABI""" + + self._parse_abi(root) + + if self.debug & AbiDebug.DUMP_ABI_STRUCTS: + self.log.debug(pformat(self.data)) + + def desc_txt(self, desc): + """Print description as found inside ABI files""" + + desc = desc.strip(" \t\n") + + return desc + "\n\n" + + def xref(self, fname): + """ + Converts a Documentation/ABI + basename into a ReST cross-reference + """ + + xref = self.file_refs.get(fname) + if not xref: + return None + else: + return xref + + def desc_rst(self, desc): + """Enrich ReST output by creating cross-references""" + + # Remove title markups from the description + # Having titles inside ABI files will only work if extra + # care would be taken in order to strictly follow the same + # level order for each markup. 
+ desc = self.re_title_mark.sub("\n\n", "\n" + desc) + desc = desc.rstrip(" \t\n").lstrip("\n") + + # Python's regex performance for non-compiled expressions is a lot + # than Perl, as Perl automatically caches them at their + # first usage. Here, we'll need to do the same, as otherwise the + # performance penalty is be high + + new_desc = "" + for d in desc.split("\n"): + if d == "": + new_desc += "\n" + continue + + # Use cross-references for doc files where needed + d = self.re_doc.sub(r":doc:`/\1`", d) + + # Use cross-references for ABI generated docs where needed + matches = self.re_abi.findall(d) + for m in matches: + abi = m[0] + m[1] + + xref = self.file_refs.get(abi) + if not xref: + # This may happen if ABI is on a separate directory, + # like parsing ABI testing and symbol is at stable. + # The proper solution is to move this part of the code + # for it to be inside sphinx/kernel_abi.py + self.log.info("Didn't find ABI reference for '%s'", abi) + else: + new = self.re_escape.sub(r"\\\1", m[1]) + d = re.sub(fr"\b{abi}\b", f":ref:`{new} <{xref}>`", d) + + # Seek for cross reference symbols like /sys/... + # Need to be careful to avoid doing it on a code block + if d[0] not in [" ", "\t"]: + matches = self.re_xref_node.findall(d) + for m in matches: + # Finding ABI here is more complex due to wildcards + xref = self.what_refs.get(m) + if xref: + new = self.re_escape.sub(r"\\\1", m) + d = re.sub(fr"\b{m}\b", f":ref:`{new} <{xref}>`", d) + + new_desc += d + "\n" + + return new_desc + "\n\n" + + def doc(self, output_in_txt=False, show_symbols=True, show_file=True, + filter_path=None): + """Print ABI at stdout""" + + part = None + for key, v in sorted(self.data.items(), + key=lambda x: (x[1].get("type", ""), + x[1].get("what"))): + + wtype = v.get("type", "Symbol") + file_ref = v.get("file") + names = v.get("what", [""]) + + if wtype == "File": + if not show_file: + continue + else: + if not show_symbols: + continue + + if filter_path: + if v.get("path") != filter_path: + continue + + msg = "" + + if wtype != "File": + cur_part = names[0] + if cur_part.find("/") >= 0: + match = self.re_what.match(cur_part) + if match: + symbol = match.group(1).rstrip("/") + cur_part = "Symbols under " + symbol + + if cur_part and cur_part != part: + part = cur_part + msg += part + "\n"+ "-" * len(part) +"\n\n" + + msg += f".. _{key}:\n\n" + + max_len = 0 + for i in range(0, len(names)): # pylint: disable=C0200 + names[i] = "**" + self.re_escape.sub(r"\\\1", names[i]) + "**" + + max_len = max(max_len, len(names[i])) + + msg += "+-" + "-" * max_len + "-+\n" + for name in names: + msg += f"| {name}" + " " * (max_len - len(name)) + " |\n" + msg += "+-" + "-" * max_len + "-+\n" + msg += "\n" + + for ref in file_ref: + if wtype == "File": + msg += f".. 
_{ref[1]}:\n\n" + else: + base = os.path.basename(ref[0]) + msg += f"Defined on file :ref:`{base} <{ref[1]}>`\n\n" + + if wtype == "File": + msg += names[0] +"\n" + "-" * len(names[0]) +"\n\n" + + desc = v.get("description") + if not desc and wtype != "File": + msg += f"DESCRIPTION MISSING for {names[0]}\n\n" + + if desc: + if output_in_txt: + msg += self.desc_txt(desc) + else: + msg += self.desc_rst(desc) + + symbols = v.get("symbols") + if symbols: + msg += "Has the following ABI:\n\n" + + for w, label in symbols: + # Escape special chars from content + content = self.re_escape.sub(r"\\\1", w) + + msg += f"- :ref:`{content} <{label}>`\n\n" + + users = v.get("users") + if users and users.strip(" \t\n"): + users = users.strip("\n").replace('\n', '\n\t') + msg += f"Users:\n\t{users}\n\n" + + ln = v.get("line_no", 1) + + yield (msg, file_ref[0][0], ln) + + def check_issues(self): + """Warn about duplicated ABI entries""" + + for what, v in self.what_symbols.items(): + files = v.get("file") + if not files: + # Should never happen if the parser works properly + self.log.warning("%s doesn't have a file associated", what) + continue + + if len(files) == 1: + continue + + f = [] + for fname, lines in sorted(files.items()): + if not lines: + f.append(f"{fname}") + elif len(lines) == 1: + f.append(f"{fname}:{lines[0]}") + else: + m = fname + "lines " + m += ", ".join(str(x) for x in lines) + f.append(m) + + self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f)) + + def search_symbols(self, expr): + """ Searches for ABI symbols """ + + regex = re.compile(expr, re.I) + + found_keys = 0 + for t in sorted(self.data.items(), key=lambda x: [0]): + v = t[1] + + wtype = v.get("type", "") + if wtype == "File": + continue + + for what in v.get("what", [""]): + if regex.search(what): + found_keys += 1 + + kernelversion = v.get("kernelversion", "").strip(" \t\n") + date = v.get("date", "").strip(" \t\n") + contact = v.get("contact", "").strip(" \t\n") + users = v.get("users", "").strip(" \t\n") + desc = v.get("description", "").strip(" \t\n") + + files = [] + for f in v.get("file", ()): + files.append(f[0]) + + what = str(found_keys) + ". " + what + title_tag = "-" * len(what) + + print(f"\n{what}\n{title_tag}\n") + + if kernelversion: + print(f"Kernel version:\t\t{kernelversion}") + + if date: + print(f"Date:\t\t\t{date}") + + if contact: + print(f"Contact:\t\t{contact}") + + if users: + print(f"Users:\t\t\t{users}") + + print("Defined on file(s):\t" + ", ".join(files)) + + if desc: + desc = desc.strip("\n") + print(f"\n{desc}\n") + + if not found_keys: + print(f"Regular expression /{expr}/ not found.") diff --git a/scripts/lib/abi/abi_regex.py b/scripts/lib/abi/abi_regex.py new file mode 100644 index 000000000000..8a57846cbc69 --- /dev/null +++ b/scripts/lib/abi/abi_regex.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +# xxpylint: disable=R0903 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. 
+# SPDX-License-Identifier: GPL-2.0 + +""" +Convert ABI what into regular expressions +""" + +import re +import sys + +from pprint import pformat + +from abi_parser import AbiParser +from helpers import AbiDebug + +class AbiRegex(AbiParser): + """Extends AbiParser to search ABI nodes with regular expressions""" + + # Escape only ASCII visible characters + escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])" + leave_others = "others" + + # Tuples with regular expressions to be compiled and replacement data + re_whats = [ + # Drop escape characters that might exist + (re.compile("\\\\"), ""), + + # Temporarily escape dot characters + (re.compile(r"\."), "\xf6"), + + # Temporarily change [0-9]+ type of patterns + (re.compile(r"\[0\-9\]\+"), "\xff"), + + # Temporarily change [\d+-\d+] type of patterns + (re.compile(r"\[0\-\d+\]"), "\xff"), + (re.compile(r"\[0:\d+\]"), "\xff"), + (re.compile(r"\[(\d+)\]"), "\xf4\\\\d+\xf5"), + + # Temporarily change [0-9] type of patterns + (re.compile(r"\[(\d)\-(\d)\]"), "\xf4\1-\2\xf5"), + + # Handle multiple option patterns + (re.compile(r"[\{\<\[]([\w_]+)(?:[,|]+([\w_]+)){1,}[\}\>\]]"), r"(\1|\2)"), + + # Handle wildcards + (re.compile(r"([^\/])\*"), "\\1\\\\w\xf7"), + (re.compile(r"/\*/"), "/.*/"), + (re.compile(r"/\xf6\xf6\xf6"), "/.*"), + (re.compile(r"\<[^\>]+\>"), "\\\\w\xf7"), + (re.compile(r"\{[^\}]+\}"), "\\\\w\xf7"), + (re.compile(r"\[[^\]]+\]"), "\\\\w\xf7"), + + (re.compile(r"XX+"), "\\\\w\xf7"), + (re.compile(r"([^A-Z])[XYZ]([^A-Z])"), "\\1\\\\w\xf7\\2"), + (re.compile(r"([^A-Z])[XYZ]$"), "\\1\\\\w\xf7"), + (re.compile(r"_[AB]_"), "_\\\\w\xf7_"), + + # Recover [0-9] type of patterns + (re.compile(r"\xf4"), "["), + (re.compile(r"\xf5"), "]"), + + # Remove duplicated spaces + (re.compile(r"\s+"), r" "), + + # Special case: drop comparison as in: + # What: foo = <something> + # (this happens on a few IIO definitions) + (re.compile(r"\s*\=.*$"), ""), + + # Escape all other symbols + (re.compile(escape_symbols), r"\\\1"), + (re.compile(r"\\\\"), r"\\"), + (re.compile(r"\\([\[\]\(\)\|])"), r"\1"), + (re.compile(r"(\d+)\\(-\d+)"), r"\1\2"), + + (re.compile(r"\xff"), r"\\d+"), + + # Special case: IIO ABI which a parenthesis. + (re.compile(r"sqrt(.*)"), r"sqrt(.*)"), + + # Simplify regexes with multiple .* + (re.compile(r"(?:\.\*){2,}"), ""), + + # Recover dot characters + (re.compile(r"\xf6"), "\\."), + # Recover plus characters + (re.compile(r"\xf7"), "+"), + ] + re_has_num = re.compile(r"\\d") + + # Symbol name after escape_chars that are considered a devnode basename + re_symbol_name = re.compile(r"(\w|\\[\.\-\:])+$") + + # List of popular group names to be skipped to minimize regex group size + # Use AbiDebug.SUBGROUP_SIZE to detect those + skip_names = set(["devices", "hwmon"]) + + def regex_append(self, what, new): + """ + Get a search group for a subset of regular expressions. + + As ABI may have thousands of symbols, using a for to search all + regular expressions is at least O(n^2). When there are wildcards, + the complexity increases substantially, eventually becoming exponential. + + To avoid spending too much time on them, use a logic to split + them into groups. The smaller the group, the better, as it would + mean that searches will be confined to a small number of regular + expressions. + + The conversion to a regex subset is tricky, as we need something + that can be easily obtained from the sysfs symbol and from the + regular expression. So, we need to discard nodes that have + wildcards. 
+ + If it can't obtain a subgroup, place the regular expression inside + a special group (self.leave_others). + """ + + search_group = None + + for search_group in reversed(new.split("/")): + if not search_group or search_group in self.skip_names: + continue + if self.re_symbol_name.match(search_group): + break + + if not search_group: + search_group = self.leave_others + + if self.debug & AbiDebug.SUBGROUP_MAP: + self.log.debug("%s: mapped as %s", what, search_group) + + try: + if search_group not in self.regex_group: + self.regex_group[search_group] = [] + + self.regex_group[search_group].append(re.compile(new)) + if self.search_string: + if what.find(self.search_string) >= 0: + print(f"What: {what}") + except re.PatternError: + self.log.warning("Ignoring '%s' as it produced an invalid regex:\n" + " '%s'", what, new) + + def get_regexes(self, what): + """ + Given an ABI devnode, return a list of all regular expressions that + may match it, based on the sub-groups created by regex_append() + """ + + re_list = [] + + patches = what.split("/") + patches.reverse() + patches.append(self.leave_others) + + for search_group in patches: + if search_group in self.regex_group: + re_list += self.regex_group[search_group] + + return re_list + + def __init__(self, *args, **kwargs): + """ + Override init method to get verbose argument + """ + + self.regex_group = None + self.search_string = None + self.re_string = None + + if "search_string" in kwargs: + self.search_string = kwargs.get("search_string") + del kwargs["search_string"] + + if self.search_string: + + try: + self.re_string = re.compile(self.search_string) + except re.PatternError as e: + msg = f"{self.search_string} is not a valid regular expression" + raise ValueError(msg) from e + + super().__init__(*args, **kwargs) + + def parse_abi(self, *args, **kwargs): + + super().parse_abi(*args, **kwargs) + + self.regex_group = {} + + print("Converting ABI What fields into regexes...", file=sys.stderr) + + for t in sorted(self.data.items(), key=lambda x: x[0]): + v = t[1] + if v.get("type") == "File": + continue + + v["regex"] = [] + + for what in v.get("what", []): + if not what.startswith("/sys"): + continue + + new = what + for r, s in self.re_whats: + try: + new = r.sub(s, new) + except re.PatternError as e: + # Help debugging troubles with new regexes + raise re.PatternError(f"{e}\nwhile re.sub('{r.pattern}', {s}, str)") from e + + v["regex"].append(new) + + if self.debug & AbiDebug.REGEX: + self.log.debug("%-90s <== %s", new, what) + + # Store regex into a subgroup to speedup searches + self.regex_append(what, new) + + if self.debug & AbiDebug.SUBGROUP_DICT: + self.log.debug("%s", pformat(self.regex_group)) + + if self.debug & AbiDebug.SUBGROUP_SIZE: + biggestd_keys = sorted(self.regex_group.keys(), + key= lambda k: len(self.regex_group[k]), + reverse=True) + + print("Top regex subgroups:", file=sys.stderr) + for k in biggestd_keys[:10]: + print(f"{k} has {len(self.regex_group[k])} elements", file=sys.stderr) diff --git a/scripts/lib/abi/helpers.py b/scripts/lib/abi/helpers.py new file mode 100644 index 000000000000..639b23e4ca33 --- /dev/null +++ b/scripts/lib/abi/helpers.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. 
+# pylint: disable=R0903 +# SPDX-License-Identifier: GPL-2.0 + +""" +Helper classes for ABI parser +""" + +ABI_DIR = "Documentation/ABI/" + + +class AbiDebug: + """Debug levels""" + + WHAT_PARSING = 1 + WHAT_OPEN = 2 + DUMP_ABI_STRUCTS = 4 + UNDEFINED = 8 + REGEX = 16 + SUBGROUP_MAP = 32 + SUBGROUP_DICT = 64 + SUBGROUP_SIZE = 128 + GRAPH = 256 + + +DEBUG_HELP = """ +1 - enable debug parsing logic +2 - enable debug messages on file open +4 - enable debug for ABI parse data +8 - enable extra debug information to identify troubles + with ABI symbols found at the local machine that + weren't found on ABI documentation (used only for + undefined subcommand) +16 - enable debug for what to regex conversion +32 - enable debug for symbol regex subgroups +64 - enable debug for sysfs graph tree variable +""" diff --git a/scripts/lib/abi/system_symbols.py b/scripts/lib/abi/system_symbols.py new file mode 100644 index 000000000000..f15c94a6e33c --- /dev/null +++ b/scripts/lib/abi/system_symbols.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# pylint: disable=R0902,R0912,R0914,R0915,R1702 +# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. +# SPDX-License-Identifier: GPL-2.0 + +""" +Parse ABI documentation and produce results from it. +""" + +import os +import re +import sys + +from concurrent import futures +from datetime import datetime +from random import shuffle + +from helpers import AbiDebug + +class SystemSymbols: + """Stores arguments for the class and initialize class vars""" + + def graph_add_file(self, path, link=None): + """ + add a file path to the sysfs graph stored at self.root + """ + + if path in self.files: + return + + name = "" + ref = self.root + for edge in path.split("/"): + name += edge + "/" + if edge not in ref: + ref[edge] = {"__name": [name.rstrip("/")]} + + ref = ref[edge] + + if link and link not in ref["__name"]: + ref["__name"].append(link.rstrip("/")) + + self.files.add(path) + + def print_graph(self, root_prefix="", root=None, level=0): + """Prints a reference tree graph using UTF-8 characters""" + + if not root: + root = self.root + level = 0 + + # Prevent endless traverse + if level > 5: + return + + if level > 0: + prefix = "├──" + last_prefix = "└──" + else: + prefix = "" + last_prefix = "" + + items = list(root.items()) + + names = root.get("__name", []) + for k, edge in items: + if k == "__name": + continue + + if not k: + k = "/" + + if len(names) > 1: + k += " links: " + ",".join(names[1:]) + + if edge == items[-1][1]: + print(root_prefix + last_prefix + k) + p = root_prefix + if level > 0: + p += " " + self.print_graph(p, edge, level + 1) + else: + print(root_prefix + prefix + k) + p = root_prefix + "│ " + self.print_graph(p, edge, level + 1) + + def _walk(self, root): + """ + Walk through sysfs to get all devnodes that aren't ignored. + + By default, uses /sys as sysfs mounting point. If another + directory is used, it replaces them to /sys at the patches. 
+ """ + + with os.scandir(root) as obj: + for entry in obj: + path = os.path.join(root, entry.name) + if self.sysfs: + p = path.replace(self.sysfs, "/sys", count=1) + else: + p = path + + if self.re_ignore.search(p): + return + + # Handle link first to avoid directory recursion + if entry.is_symlink(): + real = os.path.realpath(path) + if not self.sysfs: + self.aliases[path] = real + else: + real = real.replace(self.sysfs, "/sys", count=1) + + # Add absfile location to graph if it doesn't exist + if not self.re_ignore.search(real): + # Add link to the graph + self.graph_add_file(real, p) + + elif entry.is_file(): + self.graph_add_file(p) + + elif entry.is_dir(): + self._walk(path) + + def __init__(self, abi, sysfs="/sys", hints=False): + """ + Initialize internal variables and get a list of all files inside + sysfs that can currently be parsed. + + Please notice that there are several entries on sysfs that aren't + documented as ABI. Ignore those. + + The real paths will be stored under self.files. Aliases will be + stored in separate, as self.aliases. + """ + + self.abi = abi + self.log = abi.log + + if sysfs != "/sys": + self.sysfs = sysfs.rstrip("/") + else: + self.sysfs = None + + self.hints = hints + + self.root = {} + self.aliases = {} + self.files = set() + + dont_walk = [ + # Those require root access and aren't documented at ABI + f"^{sysfs}/kernel/debug", + f"^{sysfs}/kernel/tracing", + f"^{sysfs}/fs/pstore", + f"^{sysfs}/fs/bpf", + f"^{sysfs}/fs/fuse", + + # This is not documented at ABI + f"^{sysfs}/module", + + f"^{sysfs}/fs/cgroup", # this is big and has zero docs under ABI + f"^{sysfs}/firmware", # documented elsewhere: ACPI, DT bindings + "sections|notes", # aren't actually part of ABI + + # kernel-parameters.txt - not easy to parse + "parameters", + ] + + self.re_ignore = re.compile("|".join(dont_walk)) + + print(f"Reading {sysfs} directory contents...", file=sys.stderr) + self._walk(sysfs) + + def check_file(self, refs, found): + """Check missing ABI symbols for a given sysfs file""" + + res_list = [] + + try: + for names in refs: + fname = names[0] + + res = { + "found": False, + "fname": fname, + "msg": "", + } + res_list.append(res) + + re_what = self.abi.get_regexes(fname) + if not re_what: + self.abi.log.warning(f"missing rules for {fname}") + continue + + for name in names: + for r in re_what: + if self.abi.debug & AbiDebug.UNDEFINED: + self.log.debug("check if %s matches '%s'", name, r.pattern) + if r.match(name): + res["found"] = True + if found: + res["msg"] += f" {fname}: regex:\n\t" + continue + + if self.hints and not res["found"]: + res["msg"] += f" {fname} not found. 
Tested regexes:\n" + for r in re_what: + res["msg"] += " " + r.pattern + "\n" + + except KeyboardInterrupt: + pass + + return res_list + + def _ref_interactor(self, root): + """Recursive function to interact over the sysfs tree""" + + for k, v in root.items(): + if isinstance(v, dict): + yield from self._ref_interactor(v) + + if root == self.root or k == "__name": + continue + + if self.abi.re_string: + fname = v["__name"][0] + if self.abi.re_string.search(fname): + yield v + else: + yield v + + + def get_fileref(self, all_refs, chunk_size): + """Interactor to group refs into chunks""" + + n = 0 + refs = [] + + for ref in all_refs: + refs.append(ref) + + n += 1 + if n >= chunk_size: + yield refs + n = 0 + refs = [] + + yield refs + + def check_undefined_symbols(self, max_workers=None, chunk_size=50, + found=None, dry_run=None): + """Seach ABI for sysfs symbols missing documentation""" + + self.abi.parse_abi() + + if self.abi.debug & AbiDebug.GRAPH: + self.print_graph() + + all_refs = [] + for ref in self._ref_interactor(self.root): + all_refs.append(ref["__name"]) + + if dry_run: + print("Would check", file=sys.stderr) + for ref in all_refs: + print(", ".join(ref)) + + return + + print("Starting to search symbols (it may take several minutes):", + file=sys.stderr) + start = datetime.now() + old_elapsed = None + + # Python doesn't support multithreading due to limitations on its + # global lock (GIL). While Python 3.13 finally made GIL optional, + # there are still issues related to it. Also, we want to have + # backward compatibility with older versions of Python. + # + # So, use instead multiprocess. However, Python is very slow passing + # data from/to multiple processes. Also, it may consume lots of memory + # if the data to be shared is not small. So, we need to group workload + # in chunks that are big enough to generate performance gains while + # not being so big that would cause out-of-memory. + + num_refs = len(all_refs) + print(f"Number of references to parse: {num_refs}", file=sys.stderr) + + if not max_workers: + max_workers = os.cpu_count() + elif max_workers > os.cpu_count(): + max_workers = os.cpu_count() + + max_workers = max(max_workers, 1) + + max_chunk_size = int((num_refs + max_workers - 1) / max_workers) + chunk_size = min(chunk_size, max_chunk_size) + chunk_size = max(1, chunk_size) + + if max_workers > 1: + executor = futures.ProcessPoolExecutor + + # Place references in a random order. This may help improving + # performance, by mixing complex/simple expressions when creating + # chunks + shuffle(all_refs) + else: + # Python has a high overhead with processes. When there's just + # one worker, it is faster to not create a new process. + # Yet, User still deserves to have a progress print. So, use + # python's "thread", which is actually a single process, using + # an internal schedule to switch between tasks. No performance + # gains for non-IO tasks, but still it can be quickly interrupted + # from time to time to display progress. 
+ executor = futures.ThreadPoolExecutor + + not_found = [] + f_list = [] + with executor(max_workers=max_workers) as exe: + for refs in self.get_fileref(all_refs, chunk_size): + if refs: + try: + f_list.append(exe.submit(self.check_file, refs, found)) + + except KeyboardInterrupt: + return + + total = len(f_list) + + if not total: + if self.abi.re_string: + print(f"No ABI symbol matches {self.abi.search_string}") + else: + self.abi.log.warning("No ABI symbols found") + return + + print(f"{len(f_list):6d} jobs queued on {max_workers} workers", + file=sys.stderr) + + while f_list: + try: + t = futures.wait(f_list, timeout=1, + return_when=futures.FIRST_COMPLETED) + + done = t[0] + + for fut in done: + res_list = fut.result() + + for res in res_list: + if not res["found"]: + not_found.append(res["fname"]) + if res["msg"]: + print(res["msg"]) + + f_list.remove(fut) + except KeyboardInterrupt: + return + + except RuntimeError as e: + self.abi.log.warning(f"Future: {e}") + break + + if sys.stderr.isatty(): + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + if len(f_list) < total: + elapsed += f" ({total - len(f_list)}/{total} jobs completed). " + if elapsed != old_elapsed: + print(elapsed + "\r", end="", flush=True, + file=sys.stderr) + old_elapsed = elapsed + + elapsed = str(datetime.now() - start).split(".", maxsplit=1)[0] + print(elapsed, file=sys.stderr) + + for f in sorted(not_found): + print(f"{f} not found.") diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 56a077d204cf..67e66333bd2a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -144,10 +144,6 @@ kallsyms() kallsymopt="${kallsymopt} --all-symbols" fi - if is_enabled CONFIG_KALLSYMS_ABSOLUTE_PERCPU; then - kallsymopt="${kallsymopt} --absolute-percpu" - fi - info KSYMS "${2}.S" scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S" diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 91c91201212c..787868183b84 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -19,12 +19,14 @@ binutils) gcc) if [ "$ARCH" = parisc64 ]; then echo 12.0.0 + elif [ "$SRCARCH" = x86 ]; then + echo 8.1.0 else echo 5.1.0 fi ;; llvm) - if [ "$SRCARCH" = s390 ]; then + if [ "$SRCARCH" = s390 -o "$SRCARCH" = x86 ]; then echo 15.0.0 elif [ "$SRCARCH" = loongarch ]; then echo 18.0.0 diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index ebbdb3c42e9f..580b4e246aec 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -407,3 +407,4 @@ 464 common getxattrat sys_getxattrat 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr diff --git a/security/Kconfig b/security/Kconfig index f10dbf15c294..536061cf33a9 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -164,27 +164,6 @@ config LSM_MMAP_MIN_ADDR this low address space will need the permission specific to the systems running LSM. -config HARDENED_USERCOPY - bool "Harden memory copies between kernel and userspace" - imply STRICT_DEVMEM - help - This option checks for obviously wrong memory regions when - copying memory to/from the kernel (via copy_to_user() and - copy_from_user() functions) by rejecting memory ranges that - are larger than the specified heap object, span multiple - separately allocated pages, are not on the process stack, - or are part of the kernel text. This prevents entire classes - of heap overflow exploits and similar kernel memory exposures. 
- -config FORTIFY_SOURCE - bool "Harden common str/mem functions against buffer overflows" - depends on ARCH_HAS_FORTIFY_SOURCE - # https://github.com/llvm/llvm-project/issues/53645 - depends on !CC_IS_CLANG || !X86_32 - help - Detect overflows of buffers in common string and memory functions - where the compiler can determine and validate the buffer sizes. - config STATIC_USERMODEHELPER bool "Force all usermode helper calls through a single binary" help diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index b56e001e0c6a..c17366ce8224 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -280,6 +280,39 @@ config ZERO_CALL_USED_REGS endmenu +menu "Bounds checking" + +config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE + # https://github.com/llvm/llvm-project/issues/53645 + depends on !X86_32 || !CC_IS_CLANG || CLANG_VERSION >= 160000 + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + +config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + imply STRICT_DEVMEM + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and + copy_from_user() functions) by rejecting memory ranges that + are larger than the specified heap object, span multiple + separately allocated pages, are not on the process stack, + or are part of the kernel text. This prevents entire classes + of heap overflow exploits and similar kernel memory exposures. + +config HARDENED_USERCOPY_DEFAULT_ON + bool "Harden memory copies by default" + depends on HARDENED_USERCOPY + default HARDENED_USERCOPY + help + This has the effect of setting "hardened_usercopy=on" on the kernel + command line. This can be disabled with "hardened_usercopy=off". + +endmenu + menu "Hardening of kernel data structures" config LIST_HARDENED diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index c07d150685d7..6039afae4bfc 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1795,8 +1795,8 @@ fail2: return error; } -static int ns_mkdir_op(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ns_mkdir_op(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct aa_ns *ns, *parent; /* TODO: improve permission check */ @@ -1808,7 +1808,7 @@ static int ns_mkdir_op(struct mnt_idmap *idmap, struct inode *dir, AA_MAY_LOAD_POLICY); end_current_label_crit_section(label); if (error) - return error; + return ERR_PTR(error); parent = aa_get_ns(dir->i_private); AA_BUG(d_inode(ns_subns_dir(parent)) != dir); @@ -1843,7 +1843,7 @@ out: mutex_unlock(&parent->lock); aa_put_ns(parent); - return error; + return ERR_PTR(error); } static int ns_rmdir_op(struct inode *dir, struct dentry *dentry) diff --git a/security/keys/gc.c b/security/keys/gc.c index 7d687b0962b1..f27223ea4578 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -218,8 +218,10 @@ continue_scanning: key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); - if (refcount_read(&key->usage) == 0) + if (test_bit(KEY_FLAG_FINAL_PUT, &key->flags)) { + smp_mb(); /* Clobber key->user after FINAL_PUT seen. 
*/ goto found_unreferenced_key; + } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { diff --git a/security/keys/key.c b/security/keys/key.c index 3d7d185019d3..7198cd2ac3a3 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -658,6 +658,8 @@ void key_put(struct key *key) key->user->qnbytes -= key->quotalen; spin_unlock_irqrestore(&key->user->lock, flags); } + smp_mb(); /* key->user before FINAL_PUT set. */ + set_bit(KEY_FLAG_FINAL_PUT, &key->flags); schedule_work(&key_gc_work); } } diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 71b9dc331aae..582769ae830e 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1216,7 +1216,7 @@ static void hook_inode_free_security_rcu(void *inode_security) /* * Release the inodes used in a security policy. * - * Cf. fsnotify_unmount_inodes() and invalidate_inodes() + * Cf. fsnotify_unmount_inodes() and evict_inodes() */ static void hook_sb_delete(struct super_block *const sb) { diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig index 848f8b4a6019..aef63d3e30df 100644 --- a/security/loadpin/Kconfig +++ b/security/loadpin/Kconfig @@ -16,7 +16,7 @@ config SECURITY_LOADPIN_ENFORCE depends on SECURITY_LOADPIN # Module compression breaks LoadPin unless modules are decompressed in # the kernel. - depends on !MODULES || (MODULE_COMPRESS_NONE || MODULE_DECOMPRESS) + depends on !MODULE_COMPRESS || MODULE_DECOMPRESS help If selected, LoadPin will enforce pinning at boot. If not selected, it can be enabled at boot with the kernel parameter diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7b867dfec88b..212cdead2b52 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3395,6 +3395,9 @@ static int selinux_path_notify(const struct path *path, u64 mask, case FSNOTIFY_OBJ_TYPE_INODE: perm = FILE__WATCH; break; + case FSNOTIFY_OBJ_TYPE_MNTNS: + perm = FILE__WATCH_MOUNTNS; + break; default: return -EINVAL; } diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 03e82477dce9..f9b5ca92a825 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -8,7 +8,7 @@ COMMON_FILE_SOCK_PERMS, "unlink", "link", "rename", "execute", \ "quotaon", "mounton", "audit_access", "open", "execmod", \ "watch", "watch_mount", "watch_sb", "watch_with_perm", \ - "watch_reads" + "watch_reads", "watch_mountns" #define COMMON_SOCK_PERMS \ COMMON_FILE_SOCK_PERMS, "bind", "connect", "listen", "accept", \ diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 1971710620c1..3d064dd4e03f 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -222,7 +222,7 @@ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int rc = -ENOSYS; - struct task_struct *myself = current; + struct task_struct *myself; switch (option) { case PR_SET_PTRACER: @@ -232,11 +232,7 @@ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, * leader checking is handled later when walking the ancestry * at the time of PTRACE_ATTACH check. 
*/ - rcu_read_lock(); - if (!thread_group_leader(myself)) - myself = rcu_dereference(myself->group_leader); - get_task_struct(myself); - rcu_read_unlock(); + myself = current->group_leader; if (arg2 == 0) { yama_ptracer_del(NULL, myself); @@ -255,7 +251,6 @@ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3, } } - put_task_struct(myself); break; } diff --git a/sound/soc/fsl/imx-pcm-fiq.c b/sound/soc/fsl/imx-pcm-fiq.c index 3391430e4253..83de3ae33691 100644 --- a/sound/soc/fsl/imx-pcm-fiq.c +++ b/sound/soc/fsl/imx-pcm-fiq.c @@ -185,8 +185,7 @@ static int snd_imx_open(struct snd_soc_component *component, atomic_set(&iprtd->playing, 0); atomic_set(&iprtd->capturing, 0); - hrtimer_init(&iprtd->hrt, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - iprtd->hrt.function = snd_hrtimer_callback; + hrtimer_setup(&iprtd->hrt, snd_hrtimer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ret = snd_pcm_hw_constraint_integer(substream->runtime, SNDRV_PCM_HW_PARAM_PERIODS); diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 93807b437e4d..cb1740bc3da2 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/tools/arch/x86/include/asm/asm.h b/tools/arch/x86/include/asm/asm.h index 3ad3da9a7d97..dbe39b44256b 100644 --- a/tools/arch/x86/include/asm/asm.h +++ b/tools/arch/x86/include/asm/asm.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, @@ -123,7 +123,7 @@ #ifdef __KERNEL__ /* Exception table entry */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ @@ -154,7 +154,7 @@ # define _ASM_NOKPROBE(entry) # endif -#else /* ! __ASSEMBLY__ */ +#else /* ! 
__ASSEMBLER__ */ # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -186,7 +186,7 @@ */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..9e3fa7942e7d 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -2,14 +2,6 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include <asm/required-features.h> -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include <asm/disabled-features.h> -#endif - /* * Defines x86 CPU feature bits */ @@ -210,7 +202,6 @@ #define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index c492bdc97b05..000000000000 --- a/tools/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. Use - * cpu_feature_enabled(). 
- */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_MITIGATION_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_MITIGATION_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_MITIGATION_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING -# define DISABLE_CALL_DEPTH_TRACKING 0 -#else -# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -#endif - -#ifdef CONFIG_ADDRESS_MASKING -# define DISABLE_LAM 0 -#else -# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_XEN_PV -# define DISABLE_XENPV 0 -#else -# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -#ifdef CONFIG_X86_USER_SHADOW_STACK -#define DISABLE_USER_SHSTK 0 -#else -#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) -#endif - -#ifdef CONFIG_X86_KERNEL_IBT -#define DISABLE_IBT 0 -#else -#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) -#endif - -#ifdef CONFIG_X86_FRED -# define DISABLE_FRED 0 -#else -# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) -#endif - -#ifdef CONFIG_KVM_AMD_SEV -#define DISABLE_SEV_SNP 0 -#else -#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) -#endif - -/* - * Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) 
-#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 (DISABLE_SEV_SNP) -#define DISABLED_MASK20 0 -#define DISABLED_MASK21 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3ae84c3b8e6d..dc1c1057f26e 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) @@ -34,6 +35,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h index 1c1b7550fa55..cd94221d8335 100644 --- a/tools/arch/x86/include/asm/nops.h +++ b/tools/arch/x86/include/asm/nops.h @@ -82,7 +82,7 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern const unsigned char * const x86_nops[]; #endif diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index 46d7e06763c9..e0125afa53fb 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -45,7 +45,7 @@ #define ORC_TYPE_REGS 3 #define ORC_TYPE_REGS_PARTIAL 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/byteorder.h> /* @@ -73,6 +73,6 @@ struct orc_entry { #endif } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ diff --git a/tools/arch/x86/include/asm/pvclock-abi.h b/tools/arch/x86/include/asm/pvclock-abi.h index 1436226efe3e..b9fece5fc96d 100644 --- a/tools/arch/x86/include/asm/pvclock-abi.h +++ b/tools/arch/x86/include/asm/pvclock-abi.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_ABI_H #define _ASM_X86_PVCLOCK_ABI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * These structs MUST NOT be changed. @@ -44,5 +44,5 @@ struct pvclock_wall_clock { #define PVCLOCK_GUEST_STOPPED (1 << 1) /* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h deleted file mode 100644 index e9187ddd3d1f..000000000000 --- a/tools/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. 
- - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 -#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK20 0 -#define REQUIRED_MASK21 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 60044b608817..8de2914e6510 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -41,7 +41,7 @@ * Missing asm support * * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __init128' data + * in the asm code, as it shifts an 'unsigned __int128' data * type instead of direct representation of 128 bit constants * such as long and unsigned long. 
The fundamental problem is * that a 128 bit constant will get silently truncated by the diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index a1f55fb24bb3..f9702877ac21 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -29,7 +29,9 @@ all_files := \ compiler.h \ crt.h \ ctype.h \ + dirent.h \ errno.h \ + limits.h \ nolibc.h \ signal.h \ stackprotector.h \ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 1791a8ce58da..753a8ed2cf69 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,6 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ +void __start(void); void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index f9ab83a219b8..df4c3cc713ac 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -5,8 +5,8 @@ #ifndef _NOLIBC_ARCH_S390_H #define _NOLIBC_ARCH_S390_H -#include <asm/signal.h> -#include <asm/unistd.h> +#include <linux/signal.h> +#include <linux/unistd.h> #include "compiler.h" #include "crt.h" @@ -143,8 +143,13 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( +#ifdef __s390x__ "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ +#else + "lr %r2, %r15\n" + "ahi %r15, -96\n" +#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index c8f4e5d3add9..8a2c143c0fba 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -29,7 +29,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) +#elif defined(__s390x__) || defined(__s390__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index bbcd5fd09806..c4b10103bbec 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -10,6 +10,7 @@ char **environ __attribute__((weak)); const unsigned long *_auxv __attribute__((weak)); +void _start(void); static void __stack_chk_init(void); static void exit(int); @@ -22,6 +23,7 @@ extern void (*const __init_array_end[])(int, char **, char**) __attribute__((wea extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); +void _start_c(long *sp); __attribute__((weak,used)) void _start_c(long *sp) { diff --git a/tools/include/nolibc/dirent.h b/tools/include/nolibc/dirent.h new file mode 100644 index 000000000000..c5c30d0dd680 --- /dev/null +++ b/tools/include/nolibc/dirent.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Directory access for NOLIBC + * Copyright (C) 2025 Thomas Weißschuh <linux@weissschuh.net> + */ + +#ifndef _NOLIBC_DIRENT_H +#define _NOLIBC_DIRENT_H + +#include "stdint.h" +#include "types.h" + +#include <linux/limits.h> + +struct dirent { + ino_t d_ino; + char d_name[NAME_MAX + 1]; +}; + +/* See comment of FILE in stdio.h */ +typedef struct { + char dummy[1]; +} DIR; + +static __attribute__((unused)) +DIR *fdopendir(int fd) +{ + if (fd < 0) { + SET_ERRNO(EBADF); + return NULL; + } + return (DIR *)(intptr_t)~fd; +} + 
+static __attribute__((unused)) +DIR *opendir(const char *name) +{ + int fd; + + fd = open(name, O_RDONLY); + if (fd == -1) + return NULL; + return fdopendir(fd); +} + +static __attribute__((unused)) +int closedir(DIR *dirp) +{ + intptr_t i = (intptr_t)dirp; + + if (i >= 0) { + SET_ERRNO(EBADF); + return -1; + } + return close(~i); +} + +static __attribute__((unused)) +int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) +{ + char buf[sizeof(struct linux_dirent64) + NAME_MAX + 1]; + struct linux_dirent64 *ldir = (void *)buf; + intptr_t i = (intptr_t)dirp; + int fd, ret; + + if (i >= 0) + return EBADF; + + fd = ~i; + + ret = sys_getdents64(fd, ldir, sizeof(buf)); + if (ret < 0) + return -ret; + if (ret == 0) { + *result = NULL; + return 0; + } + + /* + * getdents64() returns as many entries as fit the buffer. + * readdir() can only return one entry at a time. + * Make sure the non-returned ones are not skipped. + */ + ret = lseek(fd, ldir->d_off, SEEK_SET); + if (ret == -1) + return errno; + + entry->d_ino = ldir->d_ino; + /* the destination should always be big enough */ + strlcpy(entry->d_name, ldir->d_name, sizeof(entry->d_name)); + *result = entry; + return 0; +} + +/* make sure to include all global symbols */ +#include "nolibc.h" + +#endif /* _NOLIBC_DIRENT_H */ diff --git a/tools/include/nolibc/errno.h b/tools/include/nolibc/errno.h index a44486ff0477..1d8d8033e8ff 100644 --- a/tools/include/nolibc/errno.h +++ b/tools/include/nolibc/errno.h @@ -7,7 +7,7 @@ #ifndef _NOLIBC_ERRNO_H #define _NOLIBC_ERRNO_H -#include <asm/errno.h> +#include <linux/errno.h> #ifndef NOLIBC_IGNORE_ERRNO #define SET_ERRNO(v) do { errno = (v); } while (0) diff --git a/tools/include/nolibc/limits.h b/tools/include/nolibc/limits.h new file mode 100644 index 000000000000..306d4141f4d2 --- /dev/null +++ b/tools/include/nolibc/limits.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Shim limits.h header for NOLIBC. + * Copyright (C) 2025 Thomas Weißschuh <thomas.weissschuh@linutronix.de> + */ + +#include "nolibc.h" diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 92436b1e4441..70872401aca8 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -31,8 +31,7 @@ * - The third level is the libc call definition. It exposes the lower raw * sys_<name>() calls in a way that looks like what a libc usually does, * takes care of specific input values, and of setting errno upon error. - * There can be minor variations compared to standard libc calls. For - * example the open() call always takes 3 args here. + * There can be minor variations compared to standard libc calls. * * The errno variable is declared static and unused. This way it can be * optimized away if not used. 
However this means that a program made of @@ -105,6 +104,7 @@ #include "string.h" #include "time.h" #include "stackprotector.h" +#include "dirent.h" /* Used by programs to avoid std includes */ #define NOLIBC diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h index 137552216e46..cdcc5904c51e 100644 --- a/tools/include/nolibc/signal.h +++ b/tools/include/nolibc/signal.h @@ -13,6 +13,7 @@ #include "sys.h" /* This one is not marked static as it's needed by libgcc for divide by zero */ +int raise(int signal); __attribute__((weak,unused,section(".text.nolibc_raise"))) int raise(int signal) { diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 1d0d5259ec41..c71a2c257177 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,6 +18,7 @@ * triggering stack protector errors themselves */ +void __stack_chk_fail(void); __attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { @@ -28,6 +29,7 @@ void __stack_chk_fail(void) for (;;); } +void __stack_chk_fail_local(void); __attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail_local(void) { diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 3892034198dd..a403351dbf60 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -350,6 +350,104 @@ int printf(const char *fmt, ...) } static __attribute__((unused)) +int vsscanf(const char *str, const char *format, va_list args) +{ + uintmax_t uval; + intmax_t ival; + int base; + char *endptr; + int matches; + int lpref; + + matches = 0; + + while (1) { + if (*format == '%') { + /* start of pattern */ + lpref = 0; + format++; + + if (*format == 'l') { + /* same as in printf() */ + lpref = 1; + format++; + if (*format == 'l') { + lpref = 2; + format++; + } + } + + if (*format == '%') { + /* literal % */ + if ('%' != *str) + goto done; + str++; + format++; + continue; + } else if (*format == 'd') { + ival = strtoll(str, &endptr, 10); + if (lpref == 0) + *va_arg(args, int *) = ival; + else if (lpref == 1) + *va_arg(args, long *) = ival; + else if (lpref == 2) + *va_arg(args, long long *) = ival; + } else if (*format == 'u' || *format == 'x' || *format == 'X') { + base = *format == 'u' ? 10 : 16; + uval = strtoull(str, &endptr, base); + if (lpref == 0) + *va_arg(args, unsigned int *) = uval; + else if (lpref == 1) + *va_arg(args, unsigned long *) = uval; + else if (lpref == 2) + *va_arg(args, unsigned long long *) = uval; + } else if (*format == 'p') { + *va_arg(args, void **) = (void *)strtoul(str, &endptr, 16); + } else { + SET_ERRNO(EILSEQ); + goto done; + } + + format++; + str = endptr; + matches++; + + } else if (*format == '\0') { + goto done; + } else if (isspace(*format)) { + /* skip spaces in format and str */ + while (isspace(*format)) + format++; + while (isspace(*str)) + str++; + } else if (*format == *str) { + /* literal match */ + format++; + str++; + } else { + if (!matches) + matches = EOF; + goto done; + } + } + +done: + return matches; +} + +static __attribute__((unused, format(scanf, 2, 3))) +int sscanf(const char *str, const char *format, ...) +{ + va_list args; + int ret; + + va_start(args, format); + ret = vsscanf(str, format, args); + va_end(args); + return ret; +} + +static __attribute__((unused)) void perror(const char *msg) { fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? 
": " : "", errno); diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 75aa273c23a6..86ad378ab1ea 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -30,6 +30,7 @@ static __attribute__((unused)) char itoa_buffer[21]; */ /* must be exported, as it's used by libgcc for various divide functions */ +void abort(void); __attribute__((weak,unused,noreturn,section(".text.nolibc_abort"))) void abort(void) { diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 9ec9c24f38c0..ba84ab700e30 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -32,6 +32,7 @@ int memcmp(const void *s1, const void *s2, size_t n) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memmove(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memmove"))) void *memmove(void *dst, const void *src, size_t len) { @@ -56,6 +57,7 @@ void *memmove(void *dst, const void *src, size_t len) #ifndef NOLIBC_ARCH_HAS_MEMCPY /* must be exported, as it's used by libgcc on ARM */ +void *memcpy(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memcpy"))) void *memcpy(void *dst, const void *src, size_t len) { @@ -73,6 +75,7 @@ void *memcpy(void *dst, const void *src, size_t len) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memset(void *dst, int b, size_t len); __attribute__((weak,unused,section(".text.nolibc_memset"))) void *memset(void *dst, int b, size_t len) { @@ -124,6 +127,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ +size_t strlen(const char *str); __attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index d4a5c2399a66..08c1c074bec8 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -10,10 +10,10 @@ #include "std.h" /* system includes */ -#include <asm/unistd.h> -#include <asm/signal.h> /* for SIGCHLD */ -#include <asm/ioctls.h> -#include <asm/mman.h> +#include <linux/unistd.h> +#include <linux/signal.h> /* for SIGCHLD */ +#include <linux/termios.h> +#include <linux/mman.h> #include <linux/fs.h> #include <linux/loop.h> #include <linux/time.h> @@ -23,7 +23,6 @@ #include <linux/prctl.h> #include <linux/resource.h> #include <linux/utsname.h> -#include <linux/signal.h> #include "arch.h" #include "errno.h" @@ -532,20 +531,16 @@ uid_t getuid(void) /* - * int ioctl(int fd, unsigned long req, void *value); + * int ioctl(int fd, unsigned long cmd, ... 
arg); */ static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) +long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - return my_syscall3(__NR_ioctl, fd, req, value); + return my_syscall3(__NR_ioctl, fd, cmd, arg); } -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - return __sysret(sys_ioctl(fd, req, value)); -} +#define ioctl(fd, cmd, arg) __sysret(sys_ioctl(fd, cmd, (unsigned long)(arg))) /* * int kill(pid_t pid, int signal); @@ -602,9 +597,36 @@ off_t sys_lseek(int fd, off_t offset, int whence) } static __attribute__((unused)) +int sys_llseek(int fd, unsigned long offset_high, unsigned long offset_low, + __kernel_loff_t *result, int whence) +{ +#ifdef __NR_llseek + return my_syscall5(__NR_llseek, fd, offset_high, offset_low, result, whence); +#else + return __nolibc_enosys(__func__, fd, offset_high, offset_low, result, whence); +#endif +} + +static __attribute__((unused)) off_t lseek(int fd, off_t offset, int whence) { - return __sysret(sys_lseek(fd, offset, whence)); + __kernel_loff_t loff = 0; + off_t result; + int ret; + + result = sys_lseek(fd, offset, whence); + if (result == -ENOSYS) { + /* Only exists on 32bit where nolibc off_t is also 32bit */ + ret = sys_llseek(fd, 0, offset, &loff, whence); + if (ret < 0) + result = ret; + else if (loff != (off_t)loff) + result = -EOVERFLOW; + else + result = loff; + } + + return __sysret(result); } @@ -742,6 +764,31 @@ int mount(const char *src, const char *tgt, return __sysret(sys_mount(src, tgt, fst, flags, data)); } +/* + * int openat(int dirfd, const char *path, int flags[, mode_t mode]); + */ + +static __attribute__((unused)) +int sys_openat(int dirfd, const char *path, int flags, mode_t mode) +{ + return my_syscall4(__NR_openat, dirfd, path, flags, mode); +} + +static __attribute__((unused)) +int openat(int dirfd, const char *path, int flags, ...) +{ + mode_t mode = 0; + + if (flags & O_CREAT) { + va_list args; + + va_start(args, flags); + mode = va_arg(args, mode_t); + va_end(args); + } + + return __sysret(sys_openat(dirfd, path, flags, mode)); +} /* * int open(const char *path, int flags[, mode_t mode]); @@ -750,13 +797,7 @@ int mount(const char *src, const char *tgt, static __attribute__((unused)) int sys_open(const char *path, int flags, mode_t mode) { -#ifdef __NR_openat return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#elif defined(__NR_open) - return my_syscall3(__NR_open, path, flags, mode); -#else - return __nolibc_enosys(__func__, path, flags, mode); -#endif } static __attribute__((unused)) @@ -768,7 +809,7 @@ int open(const char *path, int flags, ...) 
va_list args; va_start(args, flags); - mode = va_arg(args, int); + mode = va_arg(args, mode_t); va_end(args); } diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index ffff554a5230..aa5016ff3d91 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -119,14 +119,31 @@ #define SO_DETACH_REUSEPORT_BPF 68 +#define SO_PREFER_BUSY_POLL 69 +#define SO_BUSY_POLL_BUDGET 70 + +#define SO_NETNS_COOKIE 71 + +#define SO_BUF_LOCK 72 + +#define SO_RESERVE_MEM 73 + +#define SO_TXREHASH 74 + #define SO_RCVMARK 75 #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 -#define SCM_TS_OPT_ID 78 +#define SO_DEVMEM_LINEAR 78 +#define SCM_DEVMEM_LINEAR SO_DEVMEM_LINEAR +#define SO_DEVMEM_DMABUF 79 +#define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF +#define SO_DEVMEM_DONTNEED 80 + +#define SCM_TS_OPT_ID 81 -#define SO_RCVPRIORITY 79 +#define SO_RCVPRIORITY 82 #if !defined(__KERNEL__) diff --git a/tools/include/uapi/linux/const.h b/tools/include/uapi/linux/const.h index e16be0d37746..b8f629ef135f 100644 --- a/tools/include/uapi/linux/const.h +++ b/tools/include/uapi/linux/const.h @@ -33,7 +33,7 @@ * Missing asm support * * __BIT128() would not work in the asm code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. * diff --git a/tools/include/uapi/linux/elf.h b/tools/include/uapi/linux/elf.h new file mode 100644 index 000000000000..5834b83d7f9a --- /dev/null +++ b/tools/include/uapi/linux/elf.h @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_ELF_H +#define _LINUX_ELF_H + +#include <linux/types.h> +#include <linux/elf-em.h> + +/* 32-bit ELF base types. */ +typedef __u32 Elf32_Addr; +typedef __u16 Elf32_Half; +typedef __u32 Elf32_Off; +typedef __s32 Elf32_Sword; +typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; + +/* 64-bit ELF base types. */ +typedef __u64 Elf64_Addr; +typedef __u16 Elf64_Half; +typedef __s16 Elf64_SHalf; +typedef __u64 Elf64_Off; +typedef __s32 Elf64_Sword; +typedef __u32 Elf64_Word; +typedef __u64 Elf64_Xword; +typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME (PT_LOOS + 0x474e550) +#define PT_GNU_STACK (PT_LOOS + 0x474e551) +#define PT_GNU_RELRO (PT_LOOS + 0x474e552) +#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553) + + +/* ARM MTE memory tag segment type */ +#define PT_AARCH64_MEMTAG_MTE (PT_LOPROC + 0x2) + +/* + * Extended Numbering + * + * If the real number of program header table entries is larger than + * or equal to PN_XNUM(0xffff), it is set to sh_info field of the + * section header at index 0, and PN_XNUM is set to e_phnum + * field. Otherwise, the section header at index 0 is zero + * initialized, if it exists. + * + * Specifications are available in: + * + * - Oracle: Linker and Libraries. + * Part No: 817–1984–19, August 2011. 
+ * https://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf + * + * - System V ABI AMD64 Architecture Processor Supplement + * Draft Version 0.99.4, + * January 13, 2010. + * http://www.cs.washington.edu/education/courses/cse351/12wi/supp-docs/abi.pdf + */ +#define PN_XNUM 0xffff + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* This is the info that is needed to parse the dynamic section of the file */ +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_ENCODING 32 +#define OLD_DT_LOOS 0x60000000 +#define DT_LOOS 0x6000000d +#define DT_HIOS 0x6ffff000 +#define DT_VALRNGLO 0x6ffffd00 +#define DT_VALRNGHI 0x6ffffdff +#define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 +#define DT_ADDRRNGHI 0x6ffffeff +#define DT_VERSYM 0x6ffffff0 +#define DT_RELACOUNT 0x6ffffff9 +#define DT_RELCOUNT 0x6ffffffa +#define DT_FLAGS_1 0x6ffffffb +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd +#define DT_VERNEED 0x6ffffffe +#define DT_VERNEEDNUM 0x6fffffff +#define OLD_DT_HIOS 0x6fffffff +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7fffffff + +/* This info is needed when parsing the symbol table */ +#define STB_LOCAL 0 +#define STB_GLOBAL 1 +#define STB_WEAK 2 + +#define STN_UNDEF 0 + +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 + +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + +#define ELF_ST_BIND(x) ((x) >> 4) +#define ELF_ST_TYPE(x) ((x) & 0xf) +#define ELF32_ST_BIND(x) ELF_ST_BIND(x) +#define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) +#define ELF64_ST_BIND(x) ELF_ST_BIND(x) +#define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) + +typedef struct { + Elf32_Sword d_tag; + union { + Elf32_Sword d_val; + Elf32_Addr d_ptr; + } d_un; +} Elf32_Dyn; + +typedef struct { + Elf64_Sxword d_tag; /* entry tag value */ + union { + Elf64_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +} Elf64_Dyn; + +/* The following are used with relocations */ +#define ELF32_R_SYM(x) ((x) >> 8) +#define ELF32_R_TYPE(x) ((x) & 0xff) + +#define ELF64_R_SYM(i) ((i) >> 32) +#define ELF64_R_TYPE(i) ((i) & 0xffffffff) + +typedef struct elf32_rel { + Elf32_Addr r_offset; + Elf32_Word r_info; +} Elf32_Rel; + +typedef struct elf64_rel { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ +} Elf64_Rel; + +typedef struct elf32_rela { + Elf32_Addr r_offset; + Elf32_Word r_info; + Elf32_Sword r_addend; +} Elf32_Rela; + +typedef struct elf64_rela { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ + Elf64_Sxword r_addend; /* Constant addend used to compute value */ +} Elf64_Rela; + +typedef struct elf32_sym { + Elf32_Word st_name; + Elf32_Addr st_value; + Elf32_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf32_Half st_shndx; +} Elf32_Sym; + +typedef struct 
elf64_sym { + Elf64_Word st_name; /* Symbol name, index in string tbl */ + unsigned char st_info; /* Type and binding attributes */ + unsigned char st_other; /* No defined meaning, 0 */ + Elf64_Half st_shndx; /* Associated section index */ + Elf64_Addr st_value; /* Value of the symbol */ + Elf64_Xword st_size; /* Associated symbol size */ +} Elf64_Sym; + + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[EI_NIDENT]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +/* sh_type */ +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_NUM 12 +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7fffffff +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xffffffff + +/* sh_flags */ +#define SHF_WRITE 0x1 +#define SHF_ALLOC 0x2 +#define SHF_EXECINSTR 0x4 +#define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 +#define SHF_MASKPROC 0xf0000000 + +/* special section indexes */ +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xff00 +#define SHN_LOPROC 0xff00 +#define SHN_HIPROC 0xff1f +#define SHN_LIVEPATCH 0xff20 +#define SHN_ABS 0xfff1 +#define SHN_COMMON 0xfff2 +#define SHN_HIRESERVE 0xffff + +typedef struct elf32_shdr { + Elf32_Word sh_name; + Elf32_Word sh_type; + Elf32_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf32_Word sh_size; + Elf32_Word sh_link; + Elf32_Word sh_info; + Elf32_Word sh_addralign; + Elf32_Word sh_entsize; +} Elf32_Shdr; + +typedef struct elf64_shdr { + Elf64_Word sh_name; /* Section name, index in string tbl */ + Elf64_Word sh_type; /* Type of section */ + Elf64_Xword sh_flags; /* Miscellaneous section attributes */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* 
Size of section in bytes */ + Elf64_Word sh_link; /* Index of another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#ifndef ELF_OSABI +#define ELF_OSABI ELFOSABI_NONE +#endif + +/* + * Notes used in ET_CORE. Architectures export some of the arch register sets + * using the corresponding note types via the PTRACE_GETREGSET and + * PTRACE_SETREGSET requests. + * The note name for these types is "LINUX", except NT_PRFPREG that is named + * "CORE". + */ +#define NT_PRSTATUS 1 +#define NT_PRFPREG 2 +#define NT_PRPSINFO 3 +#define NT_TASKSTRUCT 4 +#define NT_AUXV 6 +/* + * Note to userspace developers: size of NT_SIGINFO note may increase + * in the future to accomodate more fields, don't assume it is fixed! + */ +#define NT_SIGINFO 0x53494749 +#define NT_FILE 0x46494c45 +#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +/* Old binutils treats 0x203 as a CET state */ +#define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ 
+#define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ + +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + +/* Note header in a PT_NOTE section */ +typedef struct elf32_note { + Elf32_Word n_namesz; /* Name size */ + Elf32_Word n_descsz; /* Content size */ + Elf32_Word n_type; /* Content type */ +} Elf32_Nhdr; + +/* Note header in a PT_NOTE section */ +typedef struct elf64_note { + Elf64_Word n_namesz; /* Name size */ + Elf64_Word n_descsz; /* Content size */ + Elf64_Word n_type; /* Content type */ +} Elf64_Nhdr; + +/* .note.gnu.property types for EM_AARCH64: */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for 
GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) + +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + +#endif /* _LINUX_ELF_H */ diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index 6f3d16dbf467..7ead94bffa4e 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -15,14 +15,14 @@ Address Dependency: When the address of a later memory access is computed 3 do_something(p->a); 4 rcu_read_unlock(); - In this case, because the address of "p->a" on line 3 is computed - from the value returned by the rcu_dereference() on line 2, the - address dependency extends from that rcu_dereference() to that - "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.rst - for more information. + In this case, because the address of "p->a" on line 3 is computed + from the value returned by the rcu_dereference() on line 2, the + address dependency extends from that rcu_dereference() to that + "p->a". In rare cases, optimizing compilers can destroy address + dependencies. Please see Documentation/RCU/rcu_dereference.rst + for more information. - See also "Control Dependency" and "Data Dependency". + See also "Control Dependency" and "Data Dependency". Acquire: With respect to a lock, acquiring that lock, for example, using spin_lock(). With respect to a non-lock shared variable, @@ -59,12 +59,12 @@ Control Dependency: When a later store's execution depends on a test 1 if (READ_ONCE(x)) 2 WRITE_ONCE(y, 1); - Here, the control dependency extends from the READ_ONCE() on - line 1 to the WRITE_ONCE() on line 2. Control dependencies are - fragile, and can be easily destroyed by optimizing compilers. - Please see control-dependencies.txt for more information. + Here, the control dependency extends from the READ_ONCE() on + line 1 to the WRITE_ONCE() on line 2. Control dependencies are + fragile, and can be easily destroyed by optimizing compilers. + Please see control-dependencies.txt for more information. - See also "Address Dependency" and "Data Dependency". + See also "Address Dependency" and "Data Dependency". Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the name suggests. And in a great many cases, a pair of CPUs is all @@ -72,10 +72,10 @@ Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the extended to additional CPUs, and the result is called a "cycle". 
In a cycle, each CPU's ordering interacts with that of the next: - CPU 0 CPU 1 CPU 2 - WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); - smp_mb(); smp_mb(); smp_mb(); - r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); + CPU 0 CPU 1 CPU 2 + WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); + smp_mb(); smp_mb(); smp_mb(); + r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); CPU 0's smp_mb() interacts with that of CPU 1, which interacts with that of CPU 2, which in turn interacts with that of CPU 0 diff --git a/tools/memory-model/Documentation/herd-representation.txt b/tools/memory-model/Documentation/herd-representation.txt index ed988906f2b7..4e19b4f2a476 100644 --- a/tools/memory-model/Documentation/herd-representation.txt +++ b/tools/memory-model/Documentation/herd-representation.txt @@ -18,6 +18,11 @@ # # By convention, a blank line in a cell means "same as the preceding line". # +# Note that the syntactic representation does not always match the sets and +# relations in linux-kernel.cat, due to redefinitions in linux-kernel.bell and +# lock.cat. For example, the po link between LKR and LKW is upgraded to an rmw +# link, and W[ACQUIRE] are not included in the Acquire set. +# # Disclaimer. The table includes representations of "add" and "and" operations; # corresponding/identical representations of "sub", "inc", "dec" and "or", "xor", # "andnot" operations are omitted. @@ -27,16 +32,16 @@ ------------------------------------------------------------------------------ | Non-RMW ops | | ------------------------------------------------------------------------------ - | READ_ONCE | R[once] | + | READ_ONCE | R[ONCE] | | atomic_read | | - | WRITE_ONCE | W[once] | + | WRITE_ONCE | W[ONCE] | | atomic_set | | - | smp_load_acquire | R[acquire] | + | smp_load_acquire | R[ACQUIRE] | | atomic_read_acquire | | - | smp_store_release | W[release] | + | smp_store_release | W[RELEASE] | | atomic_set_release | | - | smp_store_mb | W[once] ->po F[mb] | - | smp_mb | F[mb] | + | smp_store_mb | W[ONCE] ->po F[MB] | + | smp_mb | F[MB] | | smp_rmb | F[rmb] | | smp_wmb | F[wmb] | | smp_mb__before_atomic | F[before-atomic] | @@ -49,8 +54,8 @@ | rcu_read_lock | F[rcu-lock] | | rcu_read_unlock | F[rcu-unlock] | | synchronize_rcu | F[sync-rcu] | - | rcu_dereference | R[once] | - | rcu_assign_pointer | W[release] | + | rcu_dereference | R[ONCE] | + | rcu_assign_pointer | W[RELEASE] | | srcu_read_lock | R[srcu-lock] | | srcu_down_read | | | srcu_read_unlock | W[srcu-unlock] | @@ -60,32 +65,31 @@ ------------------------------------------------------------------------------ | RMW ops w/o return value | | ------------------------------------------------------------------------------ - | atomic_add | R*[noreturn] ->rmw W*[once] | + | atomic_add | R*[NORETURN] ->rmw W*[NORETURN] | | atomic_and | | | spin_lock | LKR ->po LKW | ------------------------------------------------------------------------------ | RMW ops w/ return value | | ------------------------------------------------------------------------------ - | atomic_add_return | F[mb] ->po R*[once] | - | | ->rmw W*[once] ->po F[mb] | + | atomic_add_return | R*[MB] ->rmw W*[MB] | | atomic_fetch_add | | | atomic_fetch_and | | | atomic_xchg | | | xchg | | | atomic_add_negative | | - | atomic_add_return_relaxed | R*[once] ->rmw W*[once] | + | atomic_add_return_relaxed | R*[ONCE] ->rmw W*[ONCE] | | atomic_fetch_add_relaxed | | | atomic_fetch_and_relaxed | | | atomic_xchg_relaxed | | | xchg_relaxed | | | atomic_add_negative_relaxed | | - | 
atomic_add_return_acquire | R*[acquire] ->rmw W*[once] | + | atomic_add_return_acquire | R*[ACQUIRE] ->rmw W*[ACQUIRE] | | atomic_fetch_add_acquire | | | atomic_fetch_and_acquire | | | atomic_xchg_acquire | | | xchg_acquire | | | atomic_add_negative_acquire | | - | atomic_add_return_release | R*[once] ->rmw W*[release] | + | atomic_add_return_release | R*[RELEASE] ->rmw W*[RELEASE] | | atomic_fetch_add_release | | | atomic_fetch_and_release | | | atomic_xchg_release | | @@ -94,17 +98,16 @@ ------------------------------------------------------------------------------ | Conditional RMW ops | | ------------------------------------------------------------------------------ - | atomic_cmpxchg | On success: F[mb] ->po R*[once] | - | | ->rmw W*[once] ->po F[mb] | - | | On failure: R*[once] | + | atomic_cmpxchg | On success: R*[MB] ->rmw W*[MB] | + | | On failure: R*[MB] | | cmpxchg | | | atomic_add_unless | | - | atomic_cmpxchg_relaxed | On success: R*[once] ->rmw W*[once] | - | | On failure: R*[once] | - | atomic_cmpxchg_acquire | On success: R*[acquire] ->rmw W*[once] | - | | On failure: R*[once] | - | atomic_cmpxchg_release | On success: R*[once] ->rmw W*[release] | - | | On failure: R*[once] | + | atomic_cmpxchg_relaxed | On success: R*[ONCE] ->rmw W*[ONCE] | + | | On failure: R*[ONCE] | + | atomic_cmpxchg_acquire | On success: R*[ACQUIRE] ->rmw W*[ACQUIRE] | + | | On failure: R*[ACQUIRE] | + | atomic_cmpxchg_release | On success: R*[RELEASE] ->rmw W*[RELEASE] | + | | On failure: R*[RELEASE] | | spin_trylock | On success: LKR ->po LKW | | | On failure: LF | ------------------------------------------------------------------------------ diff --git a/tools/memory-model/README b/tools/memory-model/README index dab38904206a..64c860863aa9 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -20,7 +20,7 @@ that litmus test to be exercised within the Linux kernel. 
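To make the litmus-test material above concrete, here is a minimal store-buffering test in the format herd7 consumes, written as an illustrative sketch (the test name and variable names are chosen for this example; it mirrors the SB tests shipped under tools/memory-model/litmus-tests/):

  C SB+sketch

  {}

  P0(int *x, int *y)
  {
          int r0;

          WRITE_ONCE(*x, 1);
          smp_mb();
          r0 = READ_ONCE(*y);
  }

  P1(int *x, int *y)
  {
          int r1;

          WRITE_ONCE(*y, 1);
          smp_mb();
          r1 = READ_ONCE(*x);
  }

  exists (0:r0=0 /\ 1:r1=0)

Running something like "herd7 -conf linux-kernel.cfg SB+sketch.litmus" asks the model whether both loads can observe zero; with the smp_mb() fences in place the "exists" clause is never satisfied.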
REQUIREMENTS ============ -Version 7.52 or higher of the "herd7" and "klitmus7" tools must be +Version 7.58 or higher of the "herd7" and "klitmus7" tools must be downloaded separately: https://github.com/herd/herdtools7 @@ -79,7 +79,7 @@ Several thousand more example litmus tests are available here: https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/herd https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/litmus -Documentation describing litmus tests and now to use them may be found +Documentation describing litmus tests and how to use them may be found here: tools/memory-model/Documentation/litmus-tests.txt diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index ce068700939c..fe65998002b9 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -13,17 +13,18 @@ "Linux-kernel memory consistency model" -enum Accesses = 'once (*READ_ONCE,WRITE_ONCE*) || - 'release (*smp_store_release*) || - 'acquire (*smp_load_acquire*) || - 'noreturn (* R of non-return RMW *) -instructions R[{'once,'acquire,'noreturn}] -instructions W[{'once,'release}] -instructions RMW[{'once,'acquire,'release}] +enum Accesses = 'ONCE (*READ_ONCE,WRITE_ONCE*) || + 'RELEASE (*smp_store_release*) || + 'ACQUIRE (*smp_load_acquire*) || + 'NORETURN (* R of non-return RMW *) || + 'MB (*xchg(),cmpxchg(),...*) +instructions R[Accesses] +instructions W[Accesses] +instructions RMW[Accesses] enum Barriers = 'wmb (*smp_wmb*) || 'rmb (*smp_rmb*) || - 'mb (*smp_mb*) || + 'MB (*smp_mb*) || 'barrier (*barrier*) || 'rcu-lock (*rcu_read_lock*) || 'rcu-unlock (*rcu_read_unlock*) || @@ -35,6 +36,17 @@ enum Barriers = 'wmb (*smp_wmb*) || 'after-srcu-read-unlock (*smp_mb__after_srcu_read_unlock*) instructions F[Barriers] + +(* + * Filter out syntactic annotations that do not provide the corresponding + * semantic ordering, such as Acquire on a store or Mb on a failed RMW. + *) +let FailedRMW = RMW \ (domain(rmw) | range(rmw)) +let Acquire = ACQUIRE \ W \ FailedRMW +let Release = RELEASE \ R \ FailedRMW +let Mb = MB \ FailedRMW +let Noreturn = NORETURN \ W + (* SRCU *) enum SRCU = 'srcu-lock || 'srcu-unlock || 'sync-srcu instructions SRCU[SRCU] @@ -73,7 +85,7 @@ flag ~empty rcu-rscs & (po ; [Sync-srcu] ; po) as invalid-sleep flag ~empty different-values(srcu-rscs) as srcu-bad-value-match (* Compute marked and plain memory accesses *) -let Marked = (~M) | IW | Once | Release | Acquire | domain(rmw) | range(rmw) | +let Marked = (~M) | IW | ONCE | RELEASE | ACQUIRE | MB | RMW | LKR | LKW | UL | LF | RL | RU | Srcu-lock | Srcu-unlock let Plain = M \ Marked @@ -82,3 +94,6 @@ let carry-dep = (data ; [~ Srcu-unlock] ; rfi)* let addr = carry-dep ; addr let ctrl = carry-dep ; ctrl let data = carry-dep ; data + +flag ~empty (if "lkmmv2" then 0 else _) + as this-model-requires-variant-higher-than-lkmmv1 diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index adf3c4f41229..d7e7bf13c831 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -34,6 +34,16 @@ let R4rmb = R \ Noreturn (* Reads for which rmb works *) let rmb = [R4rmb] ; fencerel(Rmb) ; [R4rmb] let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | + (* + * full-barrier RMWs (successful cmpxchg(), xchg(), etc.) act as + * though there were enclosed by smp_mb(). 
+ * The effect of these virtual smp_mb() is formalized by adding + * Mb tags to the read and write of the operation, and providing + * the same ordering as though there were additional po edges + * between the Mb tag and the read resp. write. + *) + ([M] ; po ; [Mb & R]) | + ([Mb & W] ; po ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | diff --git a/tools/memory-model/linux-kernel.cfg b/tools/memory-model/linux-kernel.cfg index 3c8098e99f41..69b04f3aad73 100644 --- a/tools/memory-model/linux-kernel.cfg +++ b/tools/memory-model/linux-kernel.cfg @@ -1,6 +1,7 @@ macros linux-kernel.def bell linux-kernel.bell model linux-kernel.cat +variant lkmmv2 graph columns squished true showevents noregs diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def index 88a39601f525..49e402782e49 100644 --- a/tools/memory-model/linux-kernel.def +++ b/tools/memory-model/linux-kernel.def @@ -6,18 +6,18 @@ // which appeared in ASPLOS 2018. // ONCE -READ_ONCE(X) __load{once}(X) -WRITE_ONCE(X,V) { __store{once}(X,V); } +READ_ONCE(X) __load{ONCE}(X) +WRITE_ONCE(X,V) { __store{ONCE}(X,V); } // Release Acquire and friends -smp_store_release(X,V) { __store{release}(*X,V); } -smp_load_acquire(X) __load{acquire}(*X) -rcu_assign_pointer(X,V) { __store{release}(X,V); } -rcu_dereference(X) __load{once}(X) -smp_store_mb(X,V) { __store{once}(X,V); __fence{mb}; } +smp_store_release(X,V) { __store{RELEASE}(*X,V); } +smp_load_acquire(X) __load{ACQUIRE}(*X) +rcu_assign_pointer(X,V) { __store{RELEASE}(X,V); } +rcu_dereference(X) __load{ONCE}(X) +smp_store_mb(X,V) { __store{ONCE}(X,V); __fence{MB}; } // Fences -smp_mb() { __fence{mb}; } +smp_mb() { __fence{MB}; } smp_rmb() { __fence{rmb}; } smp_wmb() { __fence{wmb}; } smp_mb__before_atomic() { __fence{before-atomic}; } @@ -28,14 +28,14 @@ smp_mb__after_srcu_read_unlock() { __fence{after-srcu-read-unlock}; } barrier() { __fence{barrier}; } // Exchange -xchg(X,V) __xchg{mb}(X,V) -xchg_relaxed(X,V) __xchg{once}(X,V) -xchg_release(X,V) __xchg{release}(X,V) -xchg_acquire(X,V) __xchg{acquire}(X,V) -cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W) -cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W) -cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W) -cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W) +xchg(X,V) __xchg{MB}(X,V) +xchg_relaxed(X,V) __xchg{ONCE}(X,V) +xchg_release(X,V) __xchg{RELEASE}(X,V) +xchg_acquire(X,V) __xchg{ACQUIRE}(X,V) +cmpxchg(X,V,W) __cmpxchg{MB}(X,V,W) +cmpxchg_relaxed(X,V,W) __cmpxchg{ONCE}(X,V,W) +cmpxchg_acquire(X,V,W) __cmpxchg{ACQUIRE}(X,V,W) +cmpxchg_release(X,V,W) __cmpxchg{RELEASE}(X,V,W) // Spinlocks spin_lock(X) { __lock(X); } @@ -63,57 +63,86 @@ atomic_set(X,V) { WRITE_ONCE(*X,V); } atomic_read_acquire(X) smp_load_acquire(X) atomic_set_release(X,V) { smp_store_release(X,V); } -atomic_add(V,X) { __atomic_op(X,+,V); } -atomic_sub(V,X) { __atomic_op(X,-,V); } -atomic_inc(X) { __atomic_op(X,+,1); } -atomic_dec(X) { __atomic_op(X,-,1); } - -atomic_add_return(V,X) __atomic_op_return{mb}(X,+,V) -atomic_add_return_relaxed(V,X) __atomic_op_return{once}(X,+,V) -atomic_add_return_acquire(V,X) __atomic_op_return{acquire}(X,+,V) -atomic_add_return_release(V,X) __atomic_op_return{release}(X,+,V) -atomic_fetch_add(V,X) __atomic_fetch_op{mb}(X,+,V) -atomic_fetch_add_relaxed(V,X) __atomic_fetch_op{once}(X,+,V) -atomic_fetch_add_acquire(V,X) __atomic_fetch_op{acquire}(X,+,V) -atomic_fetch_add_release(V,X) __atomic_fetch_op{release}(X,+,V) - 
-atomic_inc_return(X) __atomic_op_return{mb}(X,+,1) -atomic_inc_return_relaxed(X) __atomic_op_return{once}(X,+,1) -atomic_inc_return_acquire(X) __atomic_op_return{acquire}(X,+,1) -atomic_inc_return_release(X) __atomic_op_return{release}(X,+,1) -atomic_fetch_inc(X) __atomic_fetch_op{mb}(X,+,1) -atomic_fetch_inc_relaxed(X) __atomic_fetch_op{once}(X,+,1) -atomic_fetch_inc_acquire(X) __atomic_fetch_op{acquire}(X,+,1) -atomic_fetch_inc_release(X) __atomic_fetch_op{release}(X,+,1) - -atomic_sub_return(V,X) __atomic_op_return{mb}(X,-,V) -atomic_sub_return_relaxed(V,X) __atomic_op_return{once}(X,-,V) -atomic_sub_return_acquire(V,X) __atomic_op_return{acquire}(X,-,V) -atomic_sub_return_release(V,X) __atomic_op_return{release}(X,-,V) -atomic_fetch_sub(V,X) __atomic_fetch_op{mb}(X,-,V) -atomic_fetch_sub_relaxed(V,X) __atomic_fetch_op{once}(X,-,V) -atomic_fetch_sub_acquire(V,X) __atomic_fetch_op{acquire}(X,-,V) -atomic_fetch_sub_release(V,X) __atomic_fetch_op{release}(X,-,V) - -atomic_dec_return(X) __atomic_op_return{mb}(X,-,1) -atomic_dec_return_relaxed(X) __atomic_op_return{once}(X,-,1) -atomic_dec_return_acquire(X) __atomic_op_return{acquire}(X,-,1) -atomic_dec_return_release(X) __atomic_op_return{release}(X,-,1) -atomic_fetch_dec(X) __atomic_fetch_op{mb}(X,-,1) -atomic_fetch_dec_relaxed(X) __atomic_fetch_op{once}(X,-,1) -atomic_fetch_dec_acquire(X) __atomic_fetch_op{acquire}(X,-,1) -atomic_fetch_dec_release(X) __atomic_fetch_op{release}(X,-,1) - -atomic_xchg(X,V) __xchg{mb}(X,V) -atomic_xchg_relaxed(X,V) __xchg{once}(X,V) -atomic_xchg_release(X,V) __xchg{release}(X,V) -atomic_xchg_acquire(X,V) __xchg{acquire}(X,V) -atomic_cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W) -atomic_cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W) -atomic_cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W) -atomic_cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W) - -atomic_sub_and_test(V,X) __atomic_op_return{mb}(X,-,V) == 0 -atomic_dec_and_test(X) __atomic_op_return{mb}(X,-,1) == 0 -atomic_inc_and_test(X) __atomic_op_return{mb}(X,+,1) == 0 -atomic_add_negative(V,X) __atomic_op_return{mb}(X,+,V) < 0 +atomic_add(V,X) { __atomic_op{NORETURN}(X,+,V); } +atomic_sub(V,X) { __atomic_op{NORETURN}(X,-,V); } +atomic_and(V,X) { __atomic_op{NORETURN}(X,&,V); } +atomic_or(V,X) { __atomic_op{NORETURN}(X,|,V); } +atomic_xor(V,X) { __atomic_op{NORETURN}(X,^,V); } +atomic_inc(X) { __atomic_op{NORETURN}(X,+,1); } +atomic_dec(X) { __atomic_op{NORETURN}(X,-,1); } +atomic_andnot(V,X) { __atomic_op{NORETURN}(X,&~,V); } + +atomic_add_return(V,X) __atomic_op_return{MB}(X,+,V) +atomic_add_return_relaxed(V,X) __atomic_op_return{ONCE}(X,+,V) +atomic_add_return_acquire(V,X) __atomic_op_return{ACQUIRE}(X,+,V) +atomic_add_return_release(V,X) __atomic_op_return{RELEASE}(X,+,V) +atomic_fetch_add(V,X) __atomic_fetch_op{MB}(X,+,V) +atomic_fetch_add_relaxed(V,X) __atomic_fetch_op{ONCE}(X,+,V) +atomic_fetch_add_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,+,V) +atomic_fetch_add_release(V,X) __atomic_fetch_op{RELEASE}(X,+,V) + +atomic_fetch_and(V,X) __atomic_fetch_op{MB}(X,&,V) +atomic_fetch_and_relaxed(V,X) __atomic_fetch_op{ONCE}(X,&,V) +atomic_fetch_and_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,&,V) +atomic_fetch_and_release(V,X) __atomic_fetch_op{RELEASE}(X,&,V) + +atomic_fetch_or(V,X) __atomic_fetch_op{MB}(X,|,V) +atomic_fetch_or_relaxed(V,X) __atomic_fetch_op{ONCE}(X,|,V) +atomic_fetch_or_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,|,V) +atomic_fetch_or_release(V,X) __atomic_fetch_op{RELEASE}(X,|,V) + +atomic_fetch_xor(V,X) __atomic_fetch_op{MB}(X,^,V) 
+atomic_fetch_xor_relaxed(V,X) __atomic_fetch_op{ONCE}(X,^,V) +atomic_fetch_xor_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,^,V) +atomic_fetch_xor_release(V,X) __atomic_fetch_op{RELEASE}(X,^,V) + +atomic_inc_return(X) __atomic_op_return{MB}(X,+,1) +atomic_inc_return_relaxed(X) __atomic_op_return{ONCE}(X,+,1) +atomic_inc_return_acquire(X) __atomic_op_return{ACQUIRE}(X,+,1) +atomic_inc_return_release(X) __atomic_op_return{RELEASE}(X,+,1) +atomic_fetch_inc(X) __atomic_fetch_op{MB}(X,+,1) +atomic_fetch_inc_relaxed(X) __atomic_fetch_op{ONCE}(X,+,1) +atomic_fetch_inc_acquire(X) __atomic_fetch_op{ACQUIRE}(X,+,1) +atomic_fetch_inc_release(X) __atomic_fetch_op{RELEASE}(X,+,1) + +atomic_sub_return(V,X) __atomic_op_return{MB}(X,-,V) +atomic_sub_return_relaxed(V,X) __atomic_op_return{ONCE}(X,-,V) +atomic_sub_return_acquire(V,X) __atomic_op_return{ACQUIRE}(X,-,V) +atomic_sub_return_release(V,X) __atomic_op_return{RELEASE}(X,-,V) +atomic_fetch_sub(V,X) __atomic_fetch_op{MB}(X,-,V) +atomic_fetch_sub_relaxed(V,X) __atomic_fetch_op{ONCE}(X,-,V) +atomic_fetch_sub_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,-,V) +atomic_fetch_sub_release(V,X) __atomic_fetch_op{RELEASE}(X,-,V) + +atomic_dec_return(X) __atomic_op_return{MB}(X,-,1) +atomic_dec_return_relaxed(X) __atomic_op_return{ONCE}(X,-,1) +atomic_dec_return_acquire(X) __atomic_op_return{ACQUIRE}(X,-,1) +atomic_dec_return_release(X) __atomic_op_return{RELEASE}(X,-,1) +atomic_fetch_dec(X) __atomic_fetch_op{MB}(X,-,1) +atomic_fetch_dec_relaxed(X) __atomic_fetch_op{ONCE}(X,-,1) +atomic_fetch_dec_acquire(X) __atomic_fetch_op{ACQUIRE}(X,-,1) +atomic_fetch_dec_release(X) __atomic_fetch_op{RELEASE}(X,-,1) + +atomic_xchg(X,V) __xchg{MB}(X,V) +atomic_xchg_relaxed(X,V) __xchg{ONCE}(X,V) +atomic_xchg_release(X,V) __xchg{RELEASE}(X,V) +atomic_xchg_acquire(X,V) __xchg{ACQUIRE}(X,V) +atomic_cmpxchg(X,V,W) __cmpxchg{MB}(X,V,W) +atomic_cmpxchg_relaxed(X,V,W) __cmpxchg{ONCE}(X,V,W) +atomic_cmpxchg_acquire(X,V,W) __cmpxchg{ACQUIRE}(X,V,W) +atomic_cmpxchg_release(X,V,W) __cmpxchg{RELEASE}(X,V,W) + +atomic_sub_and_test(V,X) __atomic_op_return{MB}(X,-,V) == 0 +atomic_dec_and_test(X) __atomic_op_return{MB}(X,-,1) == 0 +atomic_inc_and_test(X) __atomic_op_return{MB}(X,+,1) == 0 +atomic_add_negative(V,X) __atomic_op_return{MB}(X,+,V) < 0 +atomic_add_negative_relaxed(V,X) __atomic_op_return{ONCE}(X,+,V) < 0 +atomic_add_negative_acquire(V,X) __atomic_op_return{ACQUIRE}(X,+,V) < 0 +atomic_add_negative_release(V,X) __atomic_op_return{RELEASE}(X,+,V) < 0 + +atomic_fetch_andnot(V,X) __atomic_fetch_op{MB}(X,&~,V) +atomic_fetch_andnot_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,&~,V) +atomic_fetch_andnot_release(V,X) __atomic_fetch_op{RELEASE}(X,&~,V) +atomic_fetch_andnot_relaxed(V,X) __atomic_fetch_op{ONCE}(X,&~,V) + +atomic_add_unless(X,V,W) __atomic_add_unless{MB}(X,V,W) diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt index 7c3ee959b63c..28ac57b9e102 100644 --- a/tools/objtool/Documentation/objtool.txt +++ b/tools/objtool/Documentation/objtool.txt @@ -28,6 +28,15 @@ Objtool has the following features: sites, enabling the kernel to patch them inline, to prevent "thunk funneling" for both security and performance reasons +- Return thunk validation -- validates return thunks are used for + certain CPU mitigations including Retbleed and SRSO + +- Return thunk annotation -- annotates all return thunk sites so kernel + can patch them inline, depending on enabled mitigations + +- Return thunk training validation -- validate that all entry paths + untrain
a "safe return" before the first return (or call) + - Non-instrumentation validation -- validates non-instrumentable ("noinstr") code rules, preventing instrumentation in low-level C entry code @@ -53,6 +62,9 @@ Objtool has the following features: - Function entry annotation -- annotates function entries, enabling kernel function tracing +- Function preamble (prefix) annotation and/or symbol generation -- used + for FineIBT and call depth tracking + - Other toolchain hacks which will go unmentioned at this time... Each feature can be enabled individually or in combination using the @@ -197,19 +209,17 @@ To achieve the validation, objtool enforces the following rules: 1. Each callable function must be annotated as such with the ELF function type. In asm code, this is typically done using the - ENTRY/ENDPROC macros. If objtool finds a return instruction + SYM_FUNC_{START,END} macros. If objtool finds a return instruction outside of a function, it flags an error since that usually indicates callable code which should be annotated accordingly. This rule is needed so that objtool can properly identify each callable function in order to analyze its stack metadata. -2. Conversely, each section of code which is *not* callable should *not* - be annotated as an ELF function. The ENDPROC macro shouldn't be used - in this case. - - This rule is needed so that objtool can ignore non-callable code. - Such code doesn't have to follow any of the other rules. +2. Conversely, each section of code which is *not* callable, or is + otherwise doing funny things with the stack or registers, should + *not* be annotated as an ELF function. Rather, SYM_CODE_{START,END} + should be used along with unwind hints. 3. Each callable function which calls another function must have the correct frame pointer logic, if required by CONFIG_FRAME_POINTER or @@ -221,7 +231,7 @@ To achieve the validation, objtool enforces the following rules: function B, the _caller_ of function A will be skipped on the stack trace. -4. Dynamic jumps and jumps to undefined symbols are only allowed if: +4. Indirect jumps and jumps to undefined symbols are only allowed if: a) the jump is part of a switch statement; or @@ -304,29 +314,29 @@ the objtool maintainers. 001e 2823e: 80 ce 02 or $0x2,%dh ... + 2. file.o: warning: objtool: .text+0x53: unreachable instruction Objtool couldn't find a code path to reach the instruction. If the error is for an asm file, and the instruction is inside (or reachable from) a callable function, the function should be annotated - with the ENTRY/ENDPROC macros (ENDPROC is the important one). - Otherwise, the code should probably be annotated with the unwind hint - macros in asm/unwind_hints.h so objtool and the unwinder can know the - stack state associated with the code. + with the SYM_FUNC_START and SYM_FUNC_END macros. - If you're 100% sure the code won't affect stack traces, or if you're - a just a bad person, you can tell objtool to ignore it. See the - "Adding exceptions" section below. + Otherwise, SYM_CODE_START can be used. In that case the code needs + to be annotated with unwind hint macros. - If it's not actually in a callable function (e.g. kernel entry code), - change ENDPROC to END. + If you're sure the code won't affect the reliability of runtime stack + traces and want objtool to ignore it, see "Adding exceptions" below. -3. file.o: warning: objtool: foo+0x48c: bar() is missing a __noreturn annotation - The call from foo() to bar() doesn't return, but bar() is missing the - __noreturn annotation. 
NOTE: In addition to annotating the function - with __noreturn, please also add it to tools/objtool/noreturns.h. +3. file.o: warning: objtool: foo+0x48c: bar() missing __noreturn in .c/.h or NORETURN() in noreturns.h + + The call from foo() to bar() doesn't return, but bar() is incorrectly + annotated. A noreturn function must be marked __noreturn in both its + declaration and its definition, and must have a NORETURN() annotation + in tools/objtool/noreturns.h. + 4. file.o: warning: objtool: func(): can't find starting instruction or @@ -341,23 +351,21 @@ the objtool maintainers. This is a kernel entry/exit instruction like sysenter or iret. Such instructions aren't allowed in a callable function, and are most - likely part of the kernel entry code. They should usually not have - the callable function annotation (ENDPROC) and should always be - annotated with the unwind hint macros in asm/unwind_hints.h. + likely part of the kernel entry code. Such code should probably be + placed in a SYM_FUNC_CODE block with unwind hints. 6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame - This is a dynamic jump or a jump to an undefined symbol. Objtool - assumed it's a sibling call and detected that the frame pointer - wasn't first restored to its original state. + This is a branch to an UNDEF symbol. Objtool assumed it's a + sibling call and detected that the stack wasn't first restored to its + original state. - If it's not really a sibling call, you may need to move the - destination code to the local file. + If it's not really a sibling call, you may need to use unwind hints + and/or move the destination code to the local file. If the instruction is not actually in a callable function (e.g. - kernel entry code), change ENDPROC to END and annotate manually with - the unwind hint macros in asm/unwind_hints.h. + kernel entry code), use SYM_CODE_{START,END} and unwind hints. 7. file: warning: objtool: func()+0x5c: stack state mismatch @@ -373,8 +381,8 @@ the objtool maintainers. Another possibility is that the code has some asm or inline asm which does some unusual things to the stack or the frame pointer. In such - cases it's probably appropriate to use the unwind hint macros in - asm/unwind_hints.h. + cases it's probably appropriate to use SYM_FUNC_CODE with unwind + hints. 8. file.o: warning: objtool: funcA() falls through to next function funcB() @@ -384,17 +392,16 @@ the objtool maintainers. can fall through into the next function. There could be different reasons for this: - 1) funcA()'s last instruction is a call to a "noreturn" function like + a) funcA()'s last instruction is a call to a "noreturn" function like panic(). In this case the noreturn function needs to be added to objtool's hard-coded global_noreturns array. Feel free to bug the objtool maintainer, or you can submit a patch. - 2) funcA() uses the unreachable() annotation in a section of code + b) funcA() uses the unreachable() annotation in a section of code that is actually reachable. - 3) If funcA() calls an inline function, the object code for funcA() - might be corrupt due to a gcc bug. For more details, see: - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646 + c) Some undefined behavior like divide by zero. + 9. file.o: warning: objtool: funcA() call to funcB() with UACCESS enabled @@ -432,24 +439,26 @@ the objtool maintainers. This limitation can be overcome by massaging the alternatives with NOPs to shift the stack changes around so they no longer conflict. + 11. 
file.o: warning: unannotated intra-function call - This warning means that a direct call is done to a destination which - is not at the beginning of a function. If this is a legit call, you - can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL - directive right before the call. + This warning means that a direct call is done to a destination which + is not at the beginning of a function. If this is a legit call, you + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + 12. file.o: warning: func(): not an indirect call target - This means that objtool is running with --ibt and a function expected - to be an indirect call target is not. In particular, this happens for - init_module() or cleanup_module() if a module relies on these special - names and does not use module_init() / module_exit() macros to create - them. + This means that objtool is running with --ibt and a function + expected to be an indirect call target is not. In particular, this + happens for init_module() or cleanup_module() if a module relies on + these special names and does not use module_init() / module_exit() + macros to create them. If the error doesn't seem to make sense, it could be a bug in objtool. -Feel free to ask the objtool maintainer for help. +Feel free to ask objtool maintainers for help. Adding exceptions diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 7a65948892e5..8c20361dd100 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -37,7 +37,7 @@ OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBE OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) # Allow old libelf to be used: -elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) +elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - 2>/dev/null | grep elf_getshdr) OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) # Always want host compilation. 
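As an illustration of the pairing that warning 3 in the objtool documentation above asks for, a minimal sketch could look like the following (the function name is hypothetical, not something in the tree; __noreturn comes from <linux/compiler_attributes.h>):

  /* declaration, typically in a header */
  void __noreturn example_halt_cpu(void);

  /* the definition must carry the attribute as well */
  void __noreturn example_halt_cpu(void)
  {
          for (;;)
                  cpu_relax();    /* spin forever; control never returns */
  }

  /* and objtool needs a matching entry in tools/objtool/noreturns.h: */
  NORETURN(example_halt_cpu)

With both pieces in place, objtool treats calls to example_halt_cpu() as dead ends, so callers are neither warned about a missing __noreturn nor about falling through to the next function.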
diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 69b66994f2a1..02e490555966 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -5,10 +5,7 @@ #include <asm/inst.h> #include <asm/orc_types.h> #include <linux/objtool_types.h> - -#ifndef EM_LOONGARCH -#define EM_LOONGARCH 258 -#endif +#include <arch/elf.h> int arch_ftrace_match(char *name) { @@ -363,3 +360,26 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->cfa.base = CFI_SP; state->cfa.offset = 0; } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_LARCH_32: + case R_LARCH_32_PCREL: + return 4; + default: + return 8; + } +} + +unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table) +{ + switch (reloc_type(reloc)) { + case R_LARCH_32_PCREL: + case R_LARCH_64_PCREL: + return reloc->sym->offset + reloc_addend(reloc) - + (reloc_offset(reloc) - reloc_offset(table)); + default: + return reloc->sym->offset + reloc_addend(reloc); + } +} diff --git a/tools/objtool/arch/loongarch/include/arch/elf.h b/tools/objtool/arch/loongarch/include/arch/elf.h index 9623d663220e..ec79062c9554 100644 --- a/tools/objtool/arch/loongarch/include/arch/elf.h +++ b/tools/objtool/arch/loongarch/include/arch/elf.h @@ -18,6 +18,13 @@ #ifndef R_LARCH_32_PCREL #define R_LARCH_32_PCREL 99 #endif +#ifndef R_LARCH_64_PCREL +#define R_LARCH_64_PCREL 109 +#endif + +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif #define R_NONE R_LARCH_NONE #define R_ABS32 R_LARCH_32 diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index 87230ed570fd..e39f86d97002 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <string.h> #include <objtool/special.h> +#include <objtool/warn.h> bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, @@ -8,9 +10,164 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, return false; } +struct table_info { + struct list_head jump_info; + unsigned long insn_offset; + unsigned long rodata_offset; +}; + +static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, + struct instruction *insn, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + struct list_head table_list; + struct table_info *orig_table; + struct table_info *next_table; + unsigned long tmp_insn_offset; + unsigned long tmp_rodata_offset; + + rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); + if (!rsec) + return; + + INIT_LIST_HEAD(&table_list); + + for_each_reloc(rsec, reloc) { + orig_table = malloc(sizeof(struct table_info)); + if (!orig_table) { + WARN("malloc failed"); + return; + } + + orig_table->insn_offset = reloc->sym->offset + reloc_addend(reloc); + reloc++; + orig_table->rodata_offset = reloc->sym->offset + reloc_addend(reloc); + + list_add_tail(&orig_table->jump_info, &table_list); + + if (reloc_idx(reloc) + 1 == sec_num_entries(rsec)) + break; + } + + list_for_each_entry(orig_table, &table_list, jump_info) { + next_table = list_next_entry(orig_table, jump_info); + list_for_each_entry_from(next_table, &table_list, jump_info) { + if (next_table->rodata_offset < orig_table->rodata_offset) { + tmp_insn_offset = next_table->insn_offset; + tmp_rodata_offset = next_table->rodata_offset; + next_table->insn_offset = 
orig_table->insn_offset; + next_table->rodata_offset = orig_table->rodata_offset; + orig_table->insn_offset = tmp_insn_offset; + orig_table->rodata_offset = tmp_rodata_offset; + } + } + } + + list_for_each_entry(orig_table, &table_list, jump_info) { + if (insn->offset == orig_table->insn_offset) { + next_table = list_next_entry(orig_table, jump_info); + if (&next_table->jump_info == &table_list) { + *table_size = 0; + return; + } + + while (next_table->rodata_offset == orig_table->rodata_offset) { + next_table = list_next_entry(next_table, jump_info); + if (&next_table->jump_info == &table_list) { + *table_size = 0; + return; + } + } + + *table_size = next_table->rodata_offset - orig_table->rodata_offset; + } + } +} + +static struct reloc *find_reloc_by_table_annotate(struct objtool_file *file, + struct instruction *insn, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + unsigned long offset; + + rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); + if (!rsec) + return NULL; + + for_each_reloc(rsec, reloc) { + if (reloc->sym->sec->rodata) + continue; + + if (strcmp(insn->sec->name, reloc->sym->sec->name)) + continue; + + offset = reloc->sym->offset + reloc_addend(reloc); + if (insn->offset == offset) { + get_rodata_table_size_by_table_annotate(file, insn, table_size); + reloc++; + return reloc; + } + } + + return NULL; +} + +static struct reloc *find_reloc_of_rodata_c_jump_table(struct section *sec, + unsigned long offset, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + + rsec = sec->rsec; + if (!rsec) + return NULL; + + for_each_reloc(rsec, reloc) { + if (reloc_offset(reloc) > offset) + break; + + if (!strcmp(reloc->sym->sec->name, C_JUMP_TABLE_SECTION)) { + *table_size = 0; + return reloc; + } + } + + return NULL; +} + struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn, unsigned long *table_size) { - return NULL; + struct reloc *annotate_reloc; + struct reloc *rodata_reloc; + struct section *table_sec; + unsigned long table_offset; + + annotate_reloc = find_reloc_by_table_annotate(file, insn, table_size); + if (!annotate_reloc) { + annotate_reloc = find_reloc_of_rodata_c_jump_table( + insn->sec, insn->offset, table_size); + if (!annotate_reloc) + return NULL; + } + + table_sec = annotate_reloc->sym->sec; + table_offset = annotate_reloc->sym->offset + reloc_addend(annotate_reloc); + + /* + * Each table entry has a rela associated with it. The rela + * should reference text in the same function as the original + * instruction. 
+ */ + rodata_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + if (!rodata_reloc) + return NULL; + + return rodata_reloc; } diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index 53b55690f320..7c0bf2429067 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -106,3 +106,17 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->regs[CFI_RA].base = CFI_CFA; state->regs[CFI_RA].offset = 0; } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_PPC_REL32: + case R_PPC_ADDR32: + case R_PPC_UADDR32: + case R_PPC_PLT32: + case R_PPC_PLTREL32: + return 4; + default: + return 8; + } +} diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index fe1362c34564..7567c893f45e 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -850,5 +850,19 @@ bool arch_is_rethunk(struct symbol *sym) bool arch_is_embedded_insn(struct symbol *sym) { return !strcmp(sym->name, "retbleed_return_thunk") || + !strcmp(sym->name, "srso_alias_safe_ret") || !strcmp(sym->name, "srso_safe_ret"); } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_PC32: + case R_X86_64_PLT32: + return 4; + default: + return 8; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 387d56a7f5fb..5f761f420b8c 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -6,6 +6,10 @@ #include <subcmd/parse-options.h> #include <string.h> #include <stdlib.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/sendfile.h> #include <objtool/builtin.h> #include <objtool/objtool.h> @@ -14,6 +18,8 @@ "error: objtool: " format "\n", \ ##__VA_ARGS__) +const char *objname; + struct opts opts; static const char * const check_usage[] = { @@ -71,7 +77,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), - OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), + OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), @@ -84,16 +90,17 @@ static const struct option check_options[] = { OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), - OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), - OPT_BOOLEAN(0, "backup", &opts.backup, "create .orig files before modification"), - OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), - OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), - OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), - OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), - OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), - OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), - OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), + OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on 
error"), + OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), + OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), + OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), + OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), + OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), + OPT_STRING('o', "output", &opts.output, "file", "output file name"), + OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), + OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), + OPT_BOOLEAN(0, "Werror", &opts.werror, "return error on warnings"), OPT_END(), }; @@ -131,6 +138,26 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]) static bool opts_valid(void) { + if (opts.mnop && !opts.mcount) { + ERROR("--mnop requires --mcount"); + return false; + } + + if (opts.noinstr && !opts.link) { + ERROR("--noinstr requires --link"); + return false; + } + + if (opts.ibt && !opts.link) { + ERROR("--ibt requires --link"); + return false; + } + + if (opts.unret && !opts.link) { + ERROR("--unret requires --link"); + return false; + } + if (opts.hack_jump_label || opts.hack_noinstr || opts.ibt || @@ -151,11 +178,6 @@ static bool opts_valid(void) return true; } - if (opts.unret && !opts.rethunk) { - ERROR("--unret requires --rethunk"); - return false; - } - if (opts.dump_orc) return true; @@ -163,76 +185,156 @@ static bool opts_valid(void) return false; } -static bool mnop_opts_valid(void) +static int copy_file(const char *src, const char *dst) { - if (opts.mnop && !opts.mcount) { - ERROR("--mnop requires --mcount"); - return false; + size_t to_copy, copied; + int dst_fd, src_fd; + struct stat stat; + off_t offset = 0; + + src_fd = open(src, O_RDONLY); + if (src_fd == -1) { + ERROR("can't open '%s' for reading", src); + return 1; } - return true; -} - -static bool link_opts_valid(struct objtool_file *file) -{ - if (opts.link) - return true; + dst_fd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0400); + if (dst_fd == -1) { + ERROR("can't open '%s' for writing", dst); + return 1; + } - if (has_multiple_files(file->elf)) { - ERROR("Linked object detected, forcing --link"); - opts.link = true; - return true; + if (fstat(src_fd, &stat) == -1) { + perror("fstat"); + return 1; } - if (opts.noinstr) { - ERROR("--noinstr requires --link"); - return false; + if (fchmod(dst_fd, stat.st_mode) == -1) { + perror("fchmod"); + return 1; } - if (opts.ibt) { - ERROR("--ibt requires --link"); - return false; + for (to_copy = stat.st_size; to_copy > 0; to_copy -= copied) { + copied = sendfile(dst_fd, src_fd, &offset, to_copy); + if (copied == -1) { + perror("sendfile"); + return 1; + } } - if (opts.unret) { - ERROR("--unret requires --link"); - return false; + close(dst_fd); + close(src_fd); + return 0; +} + +static char **save_argv(int argc, const char **argv) +{ + char **orig_argv; + + orig_argv = calloc(argc, sizeof(char *)); + if (!orig_argv) { + perror("calloc"); + return NULL; } - return true; + for (int i = 0; i < argc; i++) { + orig_argv[i] = strdup(argv[i]); + if (!orig_argv[i]) { + perror("strdup"); + return NULL; + } + }; + + return orig_argv; } +#define ORIG_SUFFIX ".orig" + int objtool_run(int argc, const char **argv) { - const char *objname; struct objtool_file *file; - int ret; + char *backup = NULL; + char **orig_argv; + int ret = 0; - argc = cmd_parse_options(argc, argv, check_usage); - 
objname = argv[0]; + orig_argv = save_argv(argc, argv); + if (!orig_argv) + return 1; + + cmd_parse_options(argc, argv, check_usage); if (!opts_valid()) return 1; + objname = argv[0]; + if (opts.dump_orc) return orc_dump(objname); + if (!opts.dryrun && opts.output) { + /* copy original .o file to output file */ + if (copy_file(objname, opts.output)) + return 1; + + /* from here on, work directly on the output file */ + objname = opts.output; + } + file = objtool_open_read(objname); if (!file) - return 1; + goto err; - if (!mnop_opts_valid()) - return 1; - - if (!link_opts_valid(file)) - return 1; + if (!opts.link && has_multiple_files(file->elf)) { + ERROR("Linked object requires --link"); + goto err; + } ret = check(file); if (ret) - return ret; + goto err; - if (file->elf->changed) - return elf_write(file->elf); + if (!opts.dryrun && file->elf->changed && elf_write(file->elf)) + goto err; return 0; + +err: + if (opts.dryrun) + goto err_msg; + + if (opts.output) { + unlink(opts.output); + goto err_msg; + } + + /* + * Make a backup before kbuild deletes the file so the error + * can be recreated without recompiling or relinking. + */ + backup = malloc(strlen(objname) + strlen(ORIG_SUFFIX) + 1); + if (!backup) { + perror("malloc"); + return 1; + } + + strcpy(backup, objname); + strcat(backup, ORIG_SUFFIX); + if (copy_file(objname, backup)) + return 1; + +err_msg: + fprintf(stderr, "%s", orig_argv[0]); + + for (int i = 1; i < argc; i++) { + char *arg = orig_argv[i]; + + if (backup && !strcmp(arg, objname)) + fprintf(stderr, " %s -o %s", backup, objname); + else + fprintf(stderr, " %s", arg); + } + + fprintf(stderr, "\n"); + + return 1; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ce973d9d8e6d..ca3435acc326 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,15 +1284,6 @@ static void annotate_call_site(struct objtool_file *file, if (!sym) sym = reloc->sym; - /* - * Alternative replacement code is just template code which is - * sometimes copied to the original instruction. For now, don't - * annotate it. (In the future we might consider annotating the - * original instruction if/when it ever makes sense to do so.) 
- */ - if (!strcmp(insn->sec->name, ".altinstr_replacement")) - return; - if (sym->static_call_tramp) { list_add_tail(&insn->call_node, &file->static_call_list); return; @@ -1350,7 +1341,8 @@ static void annotate_call_site(struct objtool_file *file, return; } - if (insn->type == INSN_CALL && !insn->sec->init) + if (insn->type == INSN_CALL && !insn->sec->init && + !insn->_call_dest->embedded_insn) list_add_tail(&insn->call_node, &file->call_list); if (!sibling && dead_end_function(file, sym)) @@ -1944,6 +1936,11 @@ out: return ret; } +__weak unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table) +{ + return reloc->sym->offset + reloc_addend(reloc); +} + static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *next_table) { @@ -1954,6 +1951,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, unsigned int prev_offset = 0; struct reloc *reloc = table; struct alternative *alt; + unsigned long sym_offset; /* * Each @reloc is a switch table relocation which points to the target @@ -1968,12 +1966,13 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, break; /* Make sure the table entries are consecutive: */ - if (prev_offset && reloc_offset(reloc) != prev_offset + 8) + if (prev_offset && reloc_offset(reloc) != prev_offset + arch_reloc_size(reloc)) break; + sym_offset = arch_jump_table_sym_offset(reloc, table); + /* Detect function pointers from contiguous objects: */ - if (reloc->sym->sec == pfunc->sec && - reloc_addend(reloc) == pfunc->offset) + if (reloc->sym->sec == pfunc->sec && sym_offset == pfunc->offset) break; /* @@ -1981,10 +1980,10 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * which point to the end of the function. Ignore them. */ if (reloc->sym->sec == pfunc->sec && - reloc_addend(reloc) == pfunc->offset + pfunc->len) + sym_offset == pfunc->offset + pfunc->len) goto next; - dest_insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); + dest_insn = find_insn(file, reloc->sym->sec, sym_offset); if (!dest_insn) break; @@ -2023,6 +2022,7 @@ static void find_jump_table(struct objtool_file *file, struct symbol *func, struct reloc *table_reloc; struct instruction *dest_insn, *orig_insn = insn; unsigned long table_size; + unsigned long sym_offset; /* * Backward search using the @first_jump_src links, these help avoid @@ -2046,7 +2046,10 @@ static void find_jump_table(struct objtool_file *file, struct symbol *func, table_reloc = arch_find_switch_table(file, insn, &table_size); if (!table_reloc) continue; - dest_insn = find_insn(file, table_reloc->sym->sec, reloc_addend(table_reloc)); + + sym_offset = table_reloc->sym->offset + reloc_addend(table_reloc); + + dest_insn = find_insn(file, table_reloc->sym->sec, sym_offset); if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; @@ -4449,35 +4452,6 @@ static int validate_sls(struct objtool_file *file) return warnings; } -static bool ignore_noreturn_call(struct instruction *insn) -{ - struct symbol *call_dest = insn_call_dest(insn); - - /* - * FIXME: hack, we need a real noreturn solution - * - * Problem is, exc_double_fault() may or may not return, depending on - * whether CONFIG_X86_ESPFIX64 is set. But objtool has no visibility - * to the kernel config. 
- * - * Other potential ways to fix it: - * - * - have compiler communicate __noreturn functions somehow - * - remove CONFIG_X86_ESPFIX64 - * - read the .config file - * - add a cmdline option - * - create a generic objtool annotation format (vs a bunch of custom - * formats) and annotate it - */ - if (!strcmp(call_dest->name, "exc_double_fault")) { - /* prevent further unreachable warnings for the caller */ - insn->sym->warned = 1; - return true; - } - - return false; -} - static int validate_reachable_instructions(struct objtool_file *file) { struct instruction *insn, *prev_insn; @@ -4494,8 +4468,8 @@ static int validate_reachable_instructions(struct objtool_file *file) prev_insn = prev_insn_same_sec(file, insn); if (prev_insn && prev_insn->dead_end) { call_dest = insn_call_dest(prev_insn); - if (call_dest && !ignore_noreturn_call(prev_insn)) { - WARN_INSN(insn, "%s() is missing a __noreturn annotation", + if (call_dest) { + WARN_INSN(insn, "%s() missing __noreturn in .c/.h or NORETURN() in noreturns.h", call_dest->name); warnings++; continue; @@ -4565,7 +4539,7 @@ static int disas_warned_funcs(struct objtool_file *file) char *funcs = NULL, *tmp; for_each_sym(file, sym) { - if (sym->warned) { + if (sym->warnings) { if (!funcs) { funcs = malloc(strlen(sym->name) + 1); strcpy(funcs, sym->name); @@ -4623,8 +4597,10 @@ int check(struct objtool_file *file) init_cfi_state(&force_undefined_cfi); force_undefined_cfi.force_undefined = true; - if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) + if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) { + ret = -1; goto out; + } cfi_hash_add(&init_cfi); cfi_hash_add(&func_cfi); @@ -4641,7 +4617,7 @@ int check(struct objtool_file *file) if (opts.retpoline) { ret = validate_retpoline(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4677,7 +4653,7 @@ int check(struct objtool_file *file) */ ret = validate_unrets(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4740,7 +4716,7 @@ int check(struct objtool_file *file) if (opts.prefix) { ret = add_prefix_symbols(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4772,9 +4748,18 @@ int check(struct objtool_file *file) out: /* - * For now, don't fail the kernel build on fatal warnings. These - * errors are still fairly common due to the growing matrix of - * supported toolchains and their recent pace of change. + * CONFIG_OBJTOOL_WERROR upgrades all warnings (and errors) to actual + * errors. + * + * Note that even "fatal" type errors don't actually return an error + * without CONFIG_OBJTOOL_WERROR. That probably needs improved at some + * point. 
*/ + if (opts.werror && (ret || warnings)) { + if (warnings) + WARN("%d warning(s) upgraded to errors", warnings); + return 1; + } + return 0; } diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6f64d611faea..be4f4b62730c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -1302,9 +1302,6 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; - if (opts.dryrun) - return 0; - /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->truncate) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index d63b46a19f39..089a1acc48a8 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,4 +97,7 @@ int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +unsigned int arch_reloc_size(struct reloc *reloc); +unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index fcca6662c8b4..0fafd0f7a209 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -29,15 +29,16 @@ struct opts { /* options: */ bool backtrace; - bool backup; bool dryrun; bool link; bool mnop; bool module; bool no_unreachable; + const char *output; bool sec_address; bool stats; bool verbose; + bool werror; }; extern struct opts opts; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index d7e815c2fd15..223ac1c24b90 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -65,10 +65,10 @@ struct symbol { u8 return_thunk : 1; u8 fentry : 1; u8 profiling_func : 1; - u8 warned : 1; u8 embedded_insn : 1; u8 local_label : 1; u8 frame_pointer : 1; + u8 warnings : 2; struct list_head pv_target; struct reloc *relocs; }; diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index ac04d3fe4dd9..e72b9d630551 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -43,8 +43,10 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define WARN(format, ...) \ fprintf(stderr, \ - "%s: warning: objtool: " format "\n", \ - objname, ##__VA_ARGS__) + "%s: %s: objtool: " format "\n", \ + objname, \ + opts.werror ? "error" : "warning", \ + ##__VA_ARGS__) #define WARN_FUNC(format, sec, offset, ...) \ ({ \ @@ -53,14 +55,22 @@ static inline char *offstr(struct section *sec, unsigned long offset) free(_str); \ }) +#define WARN_LIMIT 2 + #define WARN_INSN(insn, format, ...) \ ({ \ struct instruction *_insn = (insn); \ - if (!_insn->sym || !_insn->sym->warned) \ + BUILD_BUG_ON(WARN_LIMIT > 2); \ + if (!_insn->sym || _insn->sym->warnings < WARN_LIMIT) { \ WARN_FUNC(format, _insn->sec, _insn->offset, \ ##__VA_ARGS__); \ - if (_insn->sym) \ - _insn->sym->warned = 1; \ + if (_insn->sym) \ + _insn->sym->warnings++; \ + } else if (_insn->sym && _insn->sym->warnings == WARN_LIMIT) { \ + WARN_FUNC("skipping duplicate warning(s)", \ + _insn->sec, _insn->offset); \ + _insn->sym->warnings++; \ + } \ }) #define BT_INSN(insn, format, ...) 
\ diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6bb7edda3094..eacfe3b0a8d1 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -16,6 +16,7 @@ NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) NORETURN(__x64_sys_exit) NORETURN(__x64_sys_exit_group) +NORETURN(acpi_processor_ffh_play_dead) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) @@ -34,6 +35,7 @@ NORETURN(kunit_try_catch_throw) NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) +NORETURN(mwait_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(panic_smp_self_stop) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index f40febdd6e36..1c73fb62fd57 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -18,87 +18,19 @@ bool help; -const char *objname; static struct objtool_file file; -static bool objtool_create_backup(const char *_objname) +struct objtool_file *objtool_open_read(const char *filename) { - int len = strlen(_objname); - char *buf, *base, *name = malloc(len+6); - int s, d, l, t; - - if (!name) { - perror("failed backup name malloc"); - return false; - } - - strcpy(name, _objname); - strcpy(name + len, ".orig"); - - d = open(name, O_CREAT|O_WRONLY|O_TRUNC, 0644); - if (d < 0) { - perror("failed to create backup file"); - return false; - } - - s = open(_objname, O_RDONLY); - if (s < 0) { - perror("failed to open orig file"); - return false; - } - - buf = malloc(4096); - if (!buf) { - perror("failed backup data malloc"); - return false; - } - - while ((l = read(s, buf, 4096)) > 0) { - base = buf; - do { - t = write(d, base, l); - if (t < 0) { - perror("failed backup write"); - return false; - } - base += t; - l -= t; - } while (l); - } - - if (l < 0) { - perror("failed backup read"); - return false; - } - - free(name); - free(buf); - close(d); - close(s); - - return true; -} - -struct objtool_file *objtool_open_read(const char *_objname) -{ - if (objname) { - if (strcmp(objname, _objname)) { - WARN("won't handle more than one file at a time"); - return NULL; - } - return &file; + if (file.elf) { + WARN("won't handle more than one file at a time"); + return NULL; } - objname = _objname; - file.elf = elf_open_read(objname, O_RDWR); + file.elf = elf_open_read(filename, O_RDWR); if (!file.elf) return NULL; - if (opts.backup && !objtool_create_backup(objname)) { - WARN("can't create backup file"); - return NULL; - } - hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.return_thunk_list); diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index a62247efb64f..05ef0e297837 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -10,7 +10,7 @@ #include <objtool/warn.h> #include <objtool/endianness.h> -int orc_dump(const char *_objname) +int orc_dump(const char *filename) { int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; struct orc_entry *orc = NULL; @@ -26,12 +26,9 @@ int orc_dump(const char *_objname) Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL; struct elf dummy_elf = {}; - - objname = _objname; - elf_version(EV_CURRENT); - fd = open(objname, O_RDONLY); + fd = open(filename, O_RDONLY); if (fd == -1) { perror("open"); return -1; diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index d3c6e10dce73..a4499e5a6f9c 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -26,8 +26,6 @@ FILES=( "include/linux/hash.h" 
"include/linux/list-sort.h" "include/uapi/linux/hw_breakpoint.h" - "arch/x86/include/asm/disabled-features.h" - "arch/x86/include/asm/required-features.h" "arch/x86/include/asm/cpufeatures.h" "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d5011a0bf60..26057af6b5a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,7 +1056,7 @@ static const struct platform_data turbostat_pdata[] = { * Missing support for * INTEL_ICELAKE * INTEL_ATOM_SILVERMONT_MID - * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_SILVERMONT_MID2 * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 7849405614b1..dc4333d23189 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -7,6 +7,13 @@ #ifndef __SCX_COMMON_BPF_H #define __SCX_COMMON_BPF_H +/* + * The generated kfunc prototypes in vmlinux.h are missing address space + * attributes which cause build failures. For now, suppress the generated + * prototypes. See https://github.com/sched-ext/scx/issues/1111. + */ +#define BPF_NO_KFUNC_PROTOTYPES + #ifdef LSP #define __bpf__ #include "../vmlinux.h" @@ -18,6 +25,7 @@ #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> #include "user_exit_info.h" +#include "enum_defs.autogen.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -62,21 +70,28 @@ void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; +u32 scx_bpf_nr_node_ids(void) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +int scx_bpf_cpu_node(s32 cpu) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; +void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from @@ -84,6 +99,9 @@ u64 scx_bpf_now(void) __ksym __weak; */ #define BPF_FOR_EACH_ITER (&___it) +#define scx_read_event(e, name) \ + 
(bpf_core_field_exists((e)->name) ? (e)->name : 0) + static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} @@ -584,6 +602,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +#define READ_ONCE_ARENA(type, x) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE_ARENA(type, x, val) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + /* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index dc18b99e55cd..1dc76bd84296 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -16,6 +16,7 @@ #include <stdlib.h> #include <stdint.h> #include <errno.h> +#include "enum_defs.autogen.h" typedef uint8_t u8; typedef uint16_t u16; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 50e1499ae093..9252e1a00556 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -125,12 +125,107 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, false; \ }) +/** + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on + * in a compatible way. We will preserve this __COMPAT helper until v6.16. + * + * @enq_flags: enqueue flags from ops.enqueue() + * + * Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags + */ +static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) +{ +#ifdef HAVE_SCX_ENQ_CPU_SELECTED + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED exists. + */ + + /* + * We should temporarily suspend the macro expansion of + * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being + * rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED' + * is defined in 'scripts/gen_enums.py'. + */ +#pragma push_macro("SCX_ENQ_CPU_SELECTED") +#undef SCX_ENQ_CPU_SELECTED + u64 flag; + + /* + * When the kernel did not have SCX_ENQ_CPU_SELECTED, + * select_task_rq_scx() has never been skipped. Thus, this case + * should be considered that the CPU has already been selected. + */ + if (!bpf_core_enum_value_exists(enum scx_enq_flags, + SCX_ENQ_CPU_SELECTED)) + return true; + + flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED); + return enq_flags & flag; + + /* + * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'. + */ +#pragma pop_macro("SCX_ENQ_CPU_SELECTED") +#else + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED does NOT exist. + */ + return true; +#endif /* HAVE_SCX_ENQ_CPU_SELECTED */ +} + + #define scx_bpf_now() \ (bpf_ksym_exists(scx_bpf_now) ? \ scx_bpf_now() : \ bpf_ktime_get_ns()) /* + * v6.15: Introduce event counters. + * + * Preserve the following macro until v6.17. + */ +#define __COMPAT_scx_bpf_events(events, size) \ + (bpf_ksym_exists(scx_bpf_events) ? \ + scx_bpf_events(events, size) : ({})) + +/* + * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle + * cpumasks. + * + * Preserve the following __COMPAT_scx_*_node macros until v6.17. 
+ */ +#define __COMPAT_scx_bpf_nr_node_ids() \ + (bpf_ksym_exists(scx_bpf_nr_node_ids) ? \ + scx_bpf_nr_node_ids() : 1U) + +#define __COMPAT_scx_bpf_cpu_node(cpu) \ + (bpf_ksym_exists(scx_bpf_cpu_node) ? \ + scx_bpf_cpu_node(cpu) : 0) + +#define __COMPAT_scx_bpf_get_idle_cpumask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ? \ + scx_bpf_get_idle_cpumask_node(node) : \ + scx_bpf_get_idle_cpumask()) \ + +#define __COMPAT_scx_bpf_get_idle_smtmask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ? \ + scx_bpf_get_idle_smtmask_node(node) : \ + scx_bpf_get_idle_smtmask()) + +#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ? \ + scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_idle_cpu(cpus_allowed, flags)) + +#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \ + scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_any_cpu(cpus_allowed, flags)) + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index b50280e2ba2b..35c67c5174ac 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -106,8 +106,20 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field return false; } -#define SCX_OPS_SWITCH_PARTIAL \ - __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name) + +#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE) +#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST) +#define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING) +#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL) +#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED) +#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP) +#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE) + +#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name) + +#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE) +#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE) static inline long scx_hotplug_seq(void) { diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h new file mode 100644 index 000000000000..6e6c45f14fe1 --- /dev/null +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -0,0 +1,120 @@ +/* + * WARNING: This file is autogenerated from gen_enum_defs.py [1]. 
+ * + * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py + */ + +#ifndef __ENUM_DEFS_AUTOGEN_H__ +#define __ENUM_DEFS_AUTOGEN_H__ + +#define HAVE_SCX_DSP_DFL_MAX_BATCH +#define HAVE_SCX_DSP_MAX_LOOPS +#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT +#define HAVE_SCX_EXIT_BT_LEN +#define HAVE_SCX_EXIT_MSG_LEN +#define HAVE_SCX_EXIT_DUMP_DFL_LEN +#define HAVE_SCX_CPUPERF_ONE +#define HAVE_SCX_OPS_TASK_ITER_BATCH +#define HAVE_SCX_CPU_PREEMPT_RT +#define HAVE_SCX_CPU_PREEMPT_DL +#define HAVE_SCX_CPU_PREEMPT_STOP +#define HAVE_SCX_CPU_PREEMPT_UNKNOWN +#define HAVE_SCX_DEQ_SLEEP +#define HAVE_SCX_DEQ_CORE_SCHED_EXEC +#define HAVE_SCX_DSQ_FLAG_BUILTIN +#define HAVE_SCX_DSQ_FLAG_LOCAL_ON +#define HAVE_SCX_DSQ_INVALID +#define HAVE_SCX_DSQ_GLOBAL +#define HAVE_SCX_DSQ_LOCAL +#define HAVE_SCX_DSQ_LOCAL_ON +#define HAVE_SCX_DSQ_LOCAL_CPU_MASK +#define HAVE_SCX_DSQ_ITER_REV +#define HAVE___SCX_DSQ_ITER_HAS_SLICE +#define HAVE___SCX_DSQ_ITER_HAS_VTIME +#define HAVE___SCX_DSQ_ITER_USER_FLAGS +#define HAVE___SCX_DSQ_ITER_ALL_FLAGS +#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR +#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT +#define HAVE_SCX_ENQ_WAKEUP +#define HAVE_SCX_ENQ_HEAD +#define HAVE_SCX_ENQ_CPU_SELECTED +#define HAVE_SCX_ENQ_PREEMPT +#define HAVE_SCX_ENQ_REENQ +#define HAVE_SCX_ENQ_LAST +#define HAVE___SCX_ENQ_INTERNAL_MASK +#define HAVE_SCX_ENQ_CLEAR_OPSS +#define HAVE_SCX_ENQ_DSQ_PRIQ +#define HAVE_SCX_TASK_DSQ_ON_PRIQ +#define HAVE_SCX_TASK_QUEUED +#define HAVE_SCX_TASK_RESET_RUNNABLE_AT +#define HAVE_SCX_TASK_DEQD_FOR_SLEEP +#define HAVE_SCX_TASK_STATE_SHIFT +#define HAVE_SCX_TASK_STATE_BITS +#define HAVE_SCX_TASK_STATE_MASK +#define HAVE_SCX_TASK_CURSOR +#define HAVE_SCX_ECODE_RSN_HOTPLUG +#define HAVE_SCX_ECODE_ACT_RESTART +#define HAVE_SCX_EXIT_NONE +#define HAVE_SCX_EXIT_DONE +#define HAVE_SCX_EXIT_UNREG +#define HAVE_SCX_EXIT_UNREG_BPF +#define HAVE_SCX_EXIT_UNREG_KERN +#define HAVE_SCX_EXIT_SYSRQ +#define HAVE_SCX_EXIT_ERROR +#define HAVE_SCX_EXIT_ERROR_BPF +#define HAVE_SCX_EXIT_ERROR_STALL +#define HAVE_SCX_KF_UNLOCKED +#define HAVE_SCX_KF_CPU_RELEASE +#define HAVE_SCX_KF_DISPATCH +#define HAVE_SCX_KF_ENQUEUE +#define HAVE_SCX_KF_SELECT_CPU +#define HAVE_SCX_KF_REST +#define HAVE___SCX_KF_RQ_LOCKED +#define HAVE___SCX_KF_TERMINAL +#define HAVE_SCX_KICK_IDLE +#define HAVE_SCX_KICK_PREEMPT +#define HAVE_SCX_KICK_WAIT +#define HAVE_SCX_OPI_BEGIN +#define HAVE_SCX_OPI_NORMAL_BEGIN +#define HAVE_SCX_OPI_NORMAL_END +#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN +#define HAVE_SCX_OPI_CPU_HOTPLUG_END +#define HAVE_SCX_OPI_END +#define HAVE_SCX_OPS_ENABLING +#define HAVE_SCX_OPS_ENABLED +#define HAVE_SCX_OPS_DISABLING +#define HAVE_SCX_OPS_DISABLED +#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE +#define HAVE_SCX_OPS_ENQ_LAST +#define HAVE_SCX_OPS_ENQ_EXITING +#define HAVE_SCX_OPS_SWITCH_PARTIAL +#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT +#define HAVE_SCX_OPS_ALL_FLAGS +#define HAVE_SCX_OPSS_NONE +#define HAVE_SCX_OPSS_QUEUEING +#define HAVE_SCX_OPSS_QUEUED +#define HAVE_SCX_OPSS_DISPATCHING +#define HAVE_SCX_OPSS_QSEQ_SHIFT +#define HAVE_SCX_PICK_IDLE_CORE +#define HAVE_SCX_OPS_NAME_LEN +#define HAVE_SCX_SLICE_DFL +#define HAVE_SCX_SLICE_INF +#define HAVE_SCX_RQ_ONLINE +#define HAVE_SCX_RQ_CAN_STOP_TICK +#define HAVE_SCX_RQ_BAL_PENDING +#define HAVE_SCX_RQ_BAL_KEEP +#define HAVE_SCX_RQ_BYPASSING +#define HAVE_SCX_RQ_IN_WAKEUP +#define HAVE_SCX_RQ_IN_BALANCE +#define HAVE_SCX_TASK_NONE +#define HAVE_SCX_TASK_INIT +#define HAVE_SCX_TASK_READY +#define HAVE_SCX_TASK_ENABLED +#define HAVE_SCX_TASK_NR_STATES 
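/*
 * Each HAVE_* guard below records that the corresponding sched_ext enum
 * value or constant was present in the vmlinux.h this header was generated
 * against, so schedulers can gate code at compile time in addition to the
 * runtime CO-RE checks. A minimal, hypothetical consumer (the in-tree
 * pattern is __COMPAT_is_enq_cpu_selected() in compat.bpf.h above, which
 * additionally handles the gen_enums.py macro rewriting):
 *
 *	static inline bool cpu_was_selected(u64 enq_flags)
 *	{
 *	#ifdef HAVE_SCX_ENQ_CPU_SELECTED
 *		return enq_flags & SCX_ENQ_CPU_SELECTED;
 *	#else
 *		return true;	// ops.select_cpu() was never skipped
 *	#endif
 *	}
 */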
+#define HAVE_SCX_TG_ONLINE +#define HAVE_SCX_TG_INITED +#define HAVE_SCX_WAKE_FORK +#define HAVE_SCX_WAKE_TTWU +#define HAVE_SCX_WAKE_SYNC + +#endif /* __ENUM_DEFS_AUTOGEN_H__ */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 1e9f74525d8f..6ba6e610eeaa 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -10,6 +10,7 @@ #include <unistd.h> #include <inttypes.h> #include <signal.h> +#include <assert.h> #include <libgen.h> #include <bpf/bpf.h> #include <scx/common.h> @@ -60,14 +61,22 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids <= INT32_MAX); + while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; - case 'c': - skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + case 'c': { + u32 central_cpu = strtoul(optarg, NULL, 0); + if (central_cpu >= skel->rodata->nr_cpu_ids) { + fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids); + return -1; + } + skel->rodata->central_cpu = (s32)central_cpu; break; + } case 'v': verbose = true; break; @@ -96,7 +105,7 @@ restart: */ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); - CPU_ZERO(cpuset); + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 3a20bb0c014a..26c40ca4f36c 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) } /* if select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); @@ -763,6 +763,8 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { + struct scx_event_stats events; + bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); @@ -772,6 +774,25 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) if (print_shared_dsq) dump_shared_dsq(); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL", + scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, 
SCX_EV_BYPASS_ACTIVATE)); + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 8daac70c2f9d..2ebaf5e6942e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -35,6 +35,7 @@ TARGETS += filesystems/epoll TARGETS += filesystems/fat TARGETS += filesystems/overlayfs TARGETS += filesystems/statmount +TARGETS += filesystems/mount-notify TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c index 303260a6dc65..3bfcd3848432 100644 --- a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -227,6 +227,8 @@ static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mappin int main(int argc, char *argv[]) { int err; + void *map_ptr; + unsigned long map_size; err = mte_default_setup(); if (err) @@ -243,6 +245,15 @@ int main(int argc, char *argv[]) return KSFT_FAIL; } + /* Check if MTE supports hugetlb mappings */ + map_size = default_huge_page_size(); + map_ptr = mmap(NULL, map_size, PROT_READ | PROT_MTE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + if (map_ptr == MAP_FAILED) + ksft_exit_skip("PROT_MTE not supported with MAP_HUGETLB mappings\n"); + else + munmap(map_ptr, map_size); + /* Set test plan */ ksft_set_plan(12); @@ -270,13 +281,13 @@ int main(int argc, char *argv[]) "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap/mprotect memory\n"); mte_restore_setup(); free_hugetlb(); diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index 93f4b411b378..fc69bfcc37c4 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -7,7 +7,7 @@ from lib.py import ksft_run, ksft_exit from lib.py import ksft_eq, KsftSkipEx, KsftFailEx from lib.py import EthtoolFamily, NetDrvEpEnv from lib.py import bkg, cmd, wait_port_listen, rand_port -from lib.py import ethtool, ip +from lib.py import defer, ethtool, ip remote_ifname="" no_sleep=False @@ -60,6 +60,7 @@ def _set_xdp_generic_sb_on(cfg) -> None: prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, 
host=cfg.remote) cmd(f"ip link set dev {cfg.ifname} mtu 1500 xdpgeneric obj {prog} sec xdp", shell=True) + defer(cmd, f"ip link set dev {cfg.ifname} xdpgeneric off") if no_sleep != True: time.sleep(10) @@ -68,7 +69,9 @@ def _set_xdp_generic_mb_on(cfg) -> None: test_dir = os.path.dirname(os.path.realpath(__file__)) prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) ip("link set dev %s mtu 9000 xdpgeneric obj %s sec xdp.frags" % (cfg.ifname, prog)) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdpgeneric off") if no_sleep != True: time.sleep(10) @@ -78,6 +81,7 @@ def _set_xdp_native_sb_on(cfg) -> None: prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) cmd(f"ip -j link set dev {cfg.ifname} mtu 1500 xdp obj {prog} sec xdp", shell=True) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") xdp_info = ip("-d link show %s" % (cfg.ifname), json=True)[0] if xdp_info['xdp']['mode'] != 1: """ @@ -94,10 +98,11 @@ def _set_xdp_native_mb_on(cfg) -> None: test_dir = os.path.dirname(os.path.realpath(__file__)) prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote) + defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote) try: cmd(f"ip link set dev {cfg.ifname} mtu 9000 xdp obj {prog} sec xdp.frags", shell=True) + defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off") except Exception as e: - cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) raise KsftSkipEx('device does not support native-multi-buffer XDP') if no_sleep != True: @@ -111,6 +116,7 @@ def _set_xdp_offload_on(cfg) -> None: cmd(f"ip link set dev {cfg.ifname} xdpoffload obj {prog} sec xdp", shell=True) except Exception as e: raise KsftSkipEx('device does not support offloaded XDP') + defer(ip, f"link set dev {cfg.ifname} xdpoffload off") cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote) if no_sleep != True: @@ -157,7 +163,6 @@ def test_xdp_generic_sb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpgeneric off" % cfg.ifname) def test_xdp_generic_mb(cfg, netnl) -> None: _set_xdp_generic_mb_on(cfg) @@ -169,7 +174,6 @@ def test_xdp_generic_mb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpgeneric off" % cfg.ifname) def test_xdp_native_sb(cfg, netnl) -> None: _set_xdp_native_sb_on(cfg) @@ -181,7 +185,6 @@ def test_xdp_native_sb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdp off" % cfg.ifname) def test_xdp_native_mb(cfg, netnl) -> None: _set_xdp_native_mb_on(cfg) @@ -193,14 +196,12 @@ def test_xdp_native_mb(cfg, netnl) -> None: _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdp off" % cfg.ifname) def test_xdp_offload(cfg, netnl) -> None: _set_xdp_offload_on(cfg) _test_v4(cfg) _test_v6(cfg) _test_tcp(cfg) - ip("link set dev %s xdpoffload off" % cfg.ifname) def main() -> None: with NetDrvEpEnv(__file__) as cfg: @@ -213,7 +214,6 @@ def main() -> None: test_xdp_native_mb, test_xdp_offload], args=(cfg, EthtoolFamily())) - set_interface_init(cfg) ksft_exit() diff --git a/arch/x86/boot/tools/.gitignore b/tools/testing/selftests/filesystems/mount-notify/.gitignore index ae91f4d0d78b..82a4846cbc4b 100644 --- a/arch/x86/boot/tools/.gitignore +++ 
b/tools/testing/selftests/filesystems/mount-notify/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -build +/*_test diff --git a/tools/testing/selftests/filesystems/mount-notify/Makefile b/tools/testing/selftests/filesystems/mount-notify/Makefile new file mode 100644 index 000000000000..10be0227b5ae --- /dev/null +++ b/tools/testing/selftests/filesystems/mount-notify/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) +TEST_GEN_PROGS := mount-notify_test + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c new file mode 100644 index 000000000000..4a2d5c454fd1 --- /dev/null +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <linux/fanotify.h> +#include <unistd.h> +#include <sys/fanotify.h> +#include <sys/syscall.h> + +#include "../../kselftest_harness.h" +#include "../statmount/statmount.h" + +#ifndef FAN_MNT_ATTACH +struct fanotify_event_info_mnt { + struct fanotify_event_info_header hdr; + __u64 mnt_id; +}; +#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */ +#endif + +#ifndef FAN_MNT_DETACH +#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */ +#endif + +#ifndef FAN_REPORT_MNT +#define FAN_REPORT_MNT 0x00004000 /* Report mount events */ +#endif + +#ifndef FAN_MARK_MNTNS +#define FAN_MARK_MNTNS 0x00000110 +#endif + +static uint64_t get_mnt_id(struct __test_metadata *const _metadata, + const char *path) +{ + struct statx sx; + + ASSERT_EQ(statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx), 0); + ASSERT_TRUE(!!(sx.stx_mask & STATX_MNT_ID_UNIQUE)); + return sx.stx_mnt_id; +} + +static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; + +FIXTURE(fanotify) { + int fan_fd; + char buf[256]; + unsigned int rem; + void *next; + char root_mntpoint[sizeof(root_mntpoint_templ)]; + int orig_root; + int ns_fd; + uint64_t root_id; +}; + +FIXTURE_SETUP(fanotify) +{ + int ret; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + self->ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(self->ns_fd, 0); + + ASSERT_EQ(mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL), 0); + + strcpy(self->root_mntpoint, root_mntpoint_templ); + ASSERT_NE(mkdtemp(self->root_mntpoint), NULL); + + self->orig_root = open("/", O_PATH | O_CLOEXEC); + ASSERT_GE(self->orig_root, 0); + + ASSERT_EQ(mount("tmpfs", self->root_mntpoint, "tmpfs", 0, NULL), 0); + + ASSERT_EQ(chroot(self->root_mntpoint), 0); + + ASSERT_EQ(chdir("/"), 0); + + ASSERT_EQ(mkdir("a", 0700), 0); + + ASSERT_EQ(mkdir("b", 0700), 0); + + self->root_id = get_mnt_id(_metadata, "/"); + ASSERT_NE(self->root_id, 0); + + self->fan_fd = fanotify_init(FAN_REPORT_MNT, 0); + ASSERT_GE(self->fan_fd, 0); + + ret = fanotify_mark(self->fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS, + FAN_MNT_ATTACH | FAN_MNT_DETACH, self->ns_fd, NULL); + ASSERT_EQ(ret, 0); + + self->rem = 0; +} + +FIXTURE_TEARDOWN(fanotify) +{ + ASSERT_EQ(self->rem, 0); + close(self->fan_fd); + + ASSERT_EQ(fchdir(self->orig_root), 0); + + ASSERT_EQ(chroot("."), 0); + + EXPECT_EQ(umount2(self->root_mntpoint, MNT_DETACH), 0); + EXPECT_EQ(chdir(self->root_mntpoint), 0); + EXPECT_EQ(chdir("/"), 
0); + EXPECT_EQ(rmdir(self->root_mntpoint), 0); +} + +static uint64_t expect_notify(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t *mask) +{ + struct fanotify_event_metadata *meta; + struct fanotify_event_info_mnt *mnt; + unsigned int thislen; + + if (!self->rem) { + ssize_t len = read(self->fan_fd, self->buf, sizeof(self->buf)); + ASSERT_GT(len, 0); + + self->rem = len; + self->next = (void *) self->buf; + } + + meta = self->next; + ASSERT_TRUE(FAN_EVENT_OK(meta, self->rem)); + + thislen = meta->event_len; + self->rem -= thislen; + self->next += thislen; + + *mask = meta->mask; + thislen -= sizeof(*meta); + + mnt = ((void *) meta) + meta->event_len - thislen; + + ASSERT_EQ(thislen, sizeof(*mnt)); + + return mnt->mnt_id; +} + +static void expect_notify_n(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + unsigned int n, uint64_t mask[], uint64_t mnts[]) +{ + unsigned int i; + + for (i = 0; i < n; i++) + mnts[i] = expect_notify(_metadata, self, &mask[i]); +} + +static uint64_t expect_notify_mask(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t expect_mask) +{ + uint64_t mntid, mask; + + mntid = expect_notify(_metadata, self, &mask); + ASSERT_EQ(expect_mask, mask); + + return mntid; +} + + +static void expect_notify_mask_n(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t mask, unsigned int n, uint64_t mnts[]) +{ + unsigned int i; + + for (i = 0; i < n; i++) + mnts[i] = expect_notify_mask(_metadata, self, mask); +} + +static void verify_mount_ids(struct __test_metadata *const _metadata, + const uint64_t list1[], const uint64_t list2[], + size_t num) +{ + unsigned int i, j; + + // Check that neither list has any duplicates + for (i = 0; i < num; i++) { + for (j = 0; j < num; j++) { + if (i != j) { + ASSERT_NE(list1[i], list1[j]); + ASSERT_NE(list2[i], list2[j]); + } + } + } + // Check that all list1 memebers can be found in list2. Together with + // the above it means that the list1 and list2 represent the same sets. 
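	// With num elements in each list, "no duplicates" plus "every list1
	// entry occurs somewhere in list2" already implies the two lists are
	// equal as sets: the first loop rules out repeats, so the num matches
	// found below must hit num distinct list2 entries, i.e. all of them.
	// Example: {3, 1, 2} vs {1, 2, 3} passes, while {1, 1, 2} vs {1, 2, 3}
	// is rejected by the duplicate check above.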
+ for (i = 0; i < num; i++) { + for (j = 0; j < num; j++) { + if (list1[i] == list2[j]) + break; + } + ASSERT_NE(j, num); + } +} + +static void check_mounted(struct __test_metadata *const _metadata, + const uint64_t mnts[], size_t num) +{ + ssize_t ret; + uint64_t *list; + + list = malloc((num + 1) * sizeof(list[0])); + ASSERT_NE(list, NULL); + + ret = listmount(LSMT_ROOT, 0, 0, list, num + 1, 0); + ASSERT_EQ(ret, num); + + verify_mount_ids(_metadata, mnts, list, num); + + free(list); +} + +static void setup_mount_tree(struct __test_metadata *const _metadata, + int log2_num) +{ + int ret, i; + + ret = mount("", "/", NULL, MS_SHARED, NULL); + ASSERT_EQ(ret, 0); + + for (i = 0; i < log2_num; i++) { + ret = mount("/", "/", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + } +} + +TEST_F(fanotify, bind) +{ + int ret; + uint64_t mnts[2] = { self->root_id }; + + ret = mount("/", "/", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + // Cleanup + uint64_t detach_id; + ret = umount("/"); + ASSERT_EQ(ret, 0); + + detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH); + ASSERT_EQ(detach_id, mnts[1]); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, move) +{ + int ret; + uint64_t mnts[2] = { self->root_id }; + uint64_t move_id; + + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + ret = move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0); + ASSERT_EQ(ret, 0); + + move_id = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH); + ASSERT_EQ(move_id, mnts[1]); + + // Cleanup + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, propagate) +{ + const unsigned int log2_num = 4; + const unsigned int num = (1 << log2_num); + uint64_t mnts[num]; + + setup_mount_tree(_metadata, log2_num); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, num - 1, mnts + 1); + + mnts[0] = self->root_id; + check_mounted(_metadata, mnts, num); + + // Cleanup + int ret; + uint64_t mnts2[num]; + ret = umount2("/", MNT_DETACH); + ASSERT_EQ(ret, 0); + + ret = mount("", "/", NULL, MS_PRIVATE, NULL); + ASSERT_EQ(ret, 0); + + mnts2[0] = self->root_id; + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, num - 1, mnts2 + 1); + verify_mount_ids(_metadata, mnts, mnts2, num); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, fsmount) +{ + int ret, fs, mnt; + uint64_t mnts[2] = { self->root_id }; + + fs = fsopen("tmpfs", 0); + ASSERT_GE(fs, 0); + + ret = fsconfig(fs, FSCONFIG_CMD_CREATE, 0, 0, 0); + ASSERT_EQ(ret, 0); + + mnt = fsmount(fs, 0, 0); + ASSERT_GE(mnt, 0); + + close(fs); + + ret = move_mount(mnt, "", AT_FDCWD, "/a", MOVE_MOUNT_F_EMPTY_PATH); + ASSERT_EQ(ret, 0); + + close(mnt); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + // Cleanup + uint64_t detach_id; + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH); + ASSERT_EQ(detach_id, mnts[1]); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, reparent) +{ + uint64_t mnts[6] = { self->root_id }; + uint64_t dmnts[3]; + uint64_t masks[3]; + unsigned int i; + int ret; + + // Create setup with a[1] -> b[2] propagation + ret = mount("/", 
"/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("", "/a", NULL, MS_SHARED, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("/a", "/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("", "/b", NULL, MS_SLAVE, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1); + + check_mounted(_metadata, mnts, 3); + + // Mount on a[3], which is propagated to b[4] + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 3); + + check_mounted(_metadata, mnts, 5); + + // Mount on b[5], not propagated + ret = mount("/", "/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[5] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + + check_mounted(_metadata, mnts, 6); + + // Umount a[3], which is propagated to b[4], but not b[5] + // This will result in b[5] "falling" on b[2] + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + expect_notify_n(_metadata, self, 3, masks, dmnts); + verify_mount_ids(_metadata, mnts + 3, dmnts, 3); + + for (i = 0; i < 3; i++) { + if (dmnts[i] == mnts[5]) { + ASSERT_EQ(masks[i], FAN_MNT_ATTACH | FAN_MNT_DETACH); + } else { + ASSERT_EQ(masks[i], FAN_MNT_DETACH); + } + } + + mnts[3] = mnts[5]; + check_mounted(_metadata, mnts, 4); + + // Cleanup + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 3, dmnts); + verify_mount_ids(_metadata, mnts + 1, dmnts, 3); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, rmdir) +{ + uint64_t mnts[3] = { self->root_id }; + int ret; + + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("/", "/a/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1); + + check_mounted(_metadata, mnts, 3); + + ret = chdir("/a"); + ASSERT_EQ(ret, 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret == 0) { + chdir("/"); + unshare(CLONE_NEWNS); + mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); + umount2("/a", MNT_DETACH); + // This triggers a detach in the other namespace + rmdir("/a"); + exit(0); + } + wait(NULL); + + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 2, mnts + 1); + check_mounted(_metadata, mnts, 1); + + // Cleanup + ret = chdir("/"); + ASSERT_EQ(ret, 0); +} + +TEST_F(fanotify, pivot_root) +{ + uint64_t mnts[3] = { self->root_id }; + uint64_t mnts2[3]; + int ret; + + ret = mount("tmpfs", "/a", "tmpfs", 0, NULL); + ASSERT_EQ(ret, 0); + + mnts[2] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + + ret = mkdir("/a/new", 0700); + ASSERT_EQ(ret, 0); + + ret = mkdir("/a/old", 0700); + ASSERT_EQ(ret, 0); + + ret = mount("/a", "/a/new", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + check_mounted(_metadata, mnts, 3); + + ret = syscall(SYS_pivot_root, "/a/new", "/a/new/old"); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH, 2, mnts2); + verify_mount_ids(_metadata, mnts, mnts2, 2); + check_mounted(_metadata, mnts, 3); + + // Cleanup + ret = syscall(SYS_pivot_root, "/old", "/old/a/new"); + ASSERT_EQ(ret, 0); + + ret = umount("/a/new"); + ASSERT_EQ(ret, 0); + + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + check_mounted(_metadata, mnts, 1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c 
b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c index 457cf76f3c5f..a3d8015897e9 100644 --- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -3,6 +3,8 @@ #define _GNU_SOURCE #include <fcntl.h> +#include <linux/auto_dev-ioctl.h> +#include <linux/errno.h> #include <sched.h> #include <stdio.h> #include <string.h> @@ -146,4 +148,16 @@ TEST_F(iterate_mount_namespaces, iterate_backward) } } +TEST_F(iterate_mount_namespaces, nfs_valid_ioctl) +{ + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_OPENMOUNT, NULL), 0); + ASSERT_EQ(errno, ENOTTY); + + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_CLOSEMOUNT, NULL), 0); + ASSERT_EQ(errno, ENOTTY); + + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_READY, NULL), 0); + ASSERT_EQ(errno, ENOTTY); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile index e8d1adb021af..6c661232b3b5 100644 --- a/tools/testing/selftests/filesystems/overlayfs/Makefile +++ b/tools/testing/selftests/filesystems/overlayfs/Makefile @@ -1,7 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := dev_in_maps set_layers_via_fds +CFLAGS += -Wall +CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lcap -CFLAGS := -Wall -Werror +LOCAL_HDRS += wrappers.h log.h + +TEST_GEN_PROGS := dev_in_maps +TEST_GEN_PROGS += set_layers_via_fds include ../../lib.mk + +$(OUTPUT)/set_layers_via_fds: ../utils.c diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c index 1d0ae785a667..5074e64e74a8 100644 --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c @@ -6,26 +6,40 @@ #include <sched.h> #include <stdio.h> #include <string.h> +#include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include <sys/mount.h> #include <unistd.h> #include "../../kselftest_harness.h" +#include "../../pidfd/pidfd.h" #include "log.h" +#include "../utils.h" #include "wrappers.h" FIXTURE(set_layers_via_fds) { + int pidfd; }; FIXTURE_SETUP(set_layers_via_fds) { - ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0); + self->pidfd = -EBADF; + EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0); + EXPECT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0); } FIXTURE_TEARDOWN(set_layers_via_fds) { + if (self->pidfd >= 0) { + EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0); + EXPECT_EQ(close(self->pidfd), 0); + } umount2("/set_layers_via_fds", 0); - ASSERT_EQ(rmdir("/set_layers_via_fds"), 0); + EXPECT_EQ(rmdir("/set_layers_via_fds"), 0); + + umount2("/set_layers_via_fds_tmpfs", 0); + EXPECT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0); } TEST_F(set_layers_via_fds, set_layers_via_fds) @@ -214,4 +228,493 @@ TEST_F(set_layers_via_fds, set_500_layers_via_fds) ASSERT_EQ(close(fd_overlay), 0); } +TEST_F(set_layers_via_fds, set_override_creds) +{ + int fd_context, fd_tmpfs, fd_overlay; + int layer_fds[] = { [0 ... 
3] = -EBADF }; + pid_t pid; + int pidfd; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_override_creds_invalid) +{ + int fd_context, fd_tmpfs, fd_overlay, ret; + int layer_fds[] = { [0 ... 
3] = -EBADF }; + pid_t pid; + int fd_userns1, fd_userns2; + int ipc_sockets[2]; + char c; + const unsigned int predictable_fd_context_nr = 123; + + fd_userns1 = get_userns_fd(0, 0, 10000); + ASSERT_GE(fd_userns1, 0); + + fd_userns2 = get_userns_fd(0, 1234, 10000); + ASSERT_GE(fd_userns2, 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_GE(ret, 0); + + pid = create_child(&self->pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (close(ipc_sockets[0])) { + TH_LOG("close should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (!switch_userns(fd_userns2, 0, 0, false)) { + TH_LOG("switch_userns should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (read_nointr(ipc_sockets[1], &c, 1) != 1) { + TH_LOG("read_nointr should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (close(ipc_sockets[1])) { + TH_LOG("close should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have failed"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + ASSERT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true); + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr); + ASSERT_EQ(close(fd_context), 0); + fd_context = predictable_fd_context_nr; + ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1); + ASSERT_EQ(close(ipc_sockets[0]), 0); + + ASSERT_EQ(wait_for_pid(pid), 0); + ASSERT_EQ(close(self->pidfd), 0); + self->pidfd = -EBADF; + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) + ASSERT_EQ(close(layer_fds[i]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); 
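	/*
	 * The sequence above is the fd-based way of assembling an overlayfs
	 * mount with the new mount API; stripped of the test assertions it is
	 * roughly (sketch only, error handling and extra layers omitted, the
	 * "/mnt" target is a placeholder):
	 *
	 *	fsfd = fsopen("overlay", 0);
	 *	fsconfig(fsfd, FSCONFIG_SET_FD, "workdir",   NULL, fd_work);
	 *	fsconfig(fsfd, FSCONFIG_SET_FD, "upperdir",  NULL, fd_upper);
	 *	fsconfig(fsfd, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower);
	 *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	 *	mntfd = fsmount(fsfd, 0, 0);
	 *	move_mount(mntfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
	 *
	 * "lowerdir+" and "datadir+" append one layer per call, which is what
	 * allows layers to be passed as O_DIRECTORY/O_PATH or detached-mount
	 * fds instead of a single comma-separated lowerdir string.
	 */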
+ + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); + ASSERT_EQ(close(fd_userns1), 0); + ASSERT_EQ(close(fd_userns2), 0); +} + +TEST_F(set_layers_via_fds, set_override_creds_nomknod) +{ + int fd_context, fd_tmpfs, fd_overlay; + int layer_fds[] = { [0 ... 3] = -EBADF }; + pid_t pid; + int pidfd; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (!cap_down(CAP_MKNOD)) + _exit(EXIT_FAILURE); + + if (!cap_down(CAP_SYS_ADMIN)) + _exit(EXIT_FAILURE); + + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + ASSERT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(mknodat(fd_overlay, "dev-zero", S_IFCHR | 0644, makedev(1, 5)), -1); + ASSERT_EQ(errno, EPERM); + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower; + int layer_fds[500] = { [0 ... 
499] = -EBADF }; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + char path[100]; + + sprintf(path, "l%d", i); + ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0); + layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[i], 0); + } + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + fd_work = openat(fd_tmpfs, "w", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_work, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_upper, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0); + fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_lower, 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0); + ASSERT_EQ(close(fd_work), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0); + ASSERT_EQ(close(fd_upper), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0); + ASSERT_EQ(close(fd_lower), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_tmp; + int layer_fds[] = { [0 ... 8] = -EBADF }; + bool layers_found[] = { [0 ... 
8] = false }; + size_t len = 0; + char *line = NULL; + FILE *f_mountinfo; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0); + + fd_tmp = open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tmp, 0); + + layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[3], 0); + + layer_fds[4] = open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[4], 0); + + layer_fds[5] = open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[5], 0); + + layer_fds[6] = open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[6], 0); + + layer_fds[7] = open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[7], 0); + + layer_fds[8] = open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[8], 0); + + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + f_mountinfo = fopen("/proc/self/mountinfo", 
"r"); + ASSERT_NE(f_mountinfo, NULL); + + while (getline(&line, &len, f_mountinfo) != -1) { + char *haystack = line; + + if (strstr(haystack, "workdir=/tmp/w")) + layers_found[0] = true; + if (strstr(haystack, "upperdir=/tmp/u")) + layers_found[1] = true; + if (strstr(haystack, "lowerdir+=/tmp/l1")) + layers_found[2] = true; + if (strstr(haystack, "lowerdir+=/tmp/l2")) + layers_found[3] = true; + if (strstr(haystack, "lowerdir+=/tmp/l3")) + layers_found[4] = true; + if (strstr(haystack, "lowerdir+=/tmp/l4")) + layers_found[5] = true; + if (strstr(haystack, "datadir+=/tmp/d1")) + layers_found[6] = true; + if (strstr(haystack, "datadir+=/tmp/d2")) + layers_found[7] = true; + if (strstr(haystack, "datadir+=/tmp/d3")) + layers_found[8] = true; + } + free(line); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(layers_found[i], true); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); + ASSERT_EQ(fclose(f_mountinfo), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h index 071b95fd2ac0..c38bc48e0cfa 100644 --- a/tools/testing/selftests/filesystems/overlayfs/wrappers.h +++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h @@ -44,4 +44,21 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname, to_pathname, flags); } +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + #endif diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index f4294bab9d73..a7a5289ddae9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -25,7 +25,7 @@ static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, return syscall(__NR_statmount, &req, buf, bufsize, flags); } -static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, +static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t last_mnt_id, uint64_t list[], size_t num, unsigned int flags) { diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c new file mode 100644 index 000000000000..e553c89c5b19 --- /dev/null +++ b/tools/testing/selftests/filesystems/utils.c @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <fcntl.h> +#include <sys/types.h> +#include <dirent.h> +#include <grp.h> +#include <linux/limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/eventfd.h> +#include <sys/fsuid.h> +#include <sys/prctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/xattr.h> + +#include "utils.h" + +#define MAX_USERNS_LEVEL 32 + +#define syserror(format, ...) \ + ({ \ + fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \ + (-errno); \ + }) + +#define syserror_set(__ret__, format, ...) 
\ + ({ \ + typeof(__ret__) __internal_ret__ = (__ret__); \ + errno = labs(__ret__); \ + fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \ + __internal_ret__; \ + }) + +#define STRLITERALLEN(x) (sizeof(""x"") - 1) + +#define INTTYPE_TO_STRLEN(type) \ + (2 + (sizeof(type) <= 1 \ + ? 3 \ + : sizeof(type) <= 2 \ + ? 5 \ + : sizeof(type) <= 4 \ + ? 10 \ + : sizeof(type) <= 8 ? 20 : sizeof(int[-2 * (sizeof(type) > 8)]))) + +#define list_for_each(__iterator, __list) \ + for (__iterator = (__list)->next; __iterator != __list; __iterator = __iterator->next) + +typedef enum idmap_type_t { + ID_TYPE_UID, + ID_TYPE_GID +} idmap_type_t; + +struct id_map { + idmap_type_t map_type; + __u32 nsid; + __u32 hostid; + __u32 range; +}; + +struct list { + void *elem; + struct list *next; + struct list *prev; +}; + +struct userns_hierarchy { + int fd_userns; + int fd_event; + unsigned int level; + struct list id_map; +}; + +static inline void list_init(struct list *list) +{ + list->elem = NULL; + list->next = list->prev = list; +} + +static inline int list_empty(const struct list *list) +{ + return list == list->next; +} + +static inline void __list_add(struct list *new, struct list *prev, struct list *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add_tail(struct list *head, struct list *list) +{ + __list_add(list, head->prev, head); +} + +static inline void list_del(struct list *list) +{ + struct list *next, *prev; + + next = list->next; + prev = list->prev; + next->prev = prev; + prev->next = next; +} + +static ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = read(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +#define __STACK_SIZE (8 * 1024 * 1024) +static pid_t do_clone(int (*fn)(void *), void *arg, int flags) +{ + void *stack; + + stack = malloc(__STACK_SIZE); + if (!stack) + return -ENOMEM; + +#ifdef __ia64__ + return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#else + return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#endif +} + +static int get_userns_fd_cb(void *data) +{ + for (;;) + pause(); + _exit(0); +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size) +{ + int fd = -EBADF, setgroups_fd = -EBADF; + int fret = -1; + int ret; + char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + + STRLITERALLEN("/setgroups") + 1]; + + if (geteuid() != 0 && map_type == ID_TYPE_GID) { + ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid); + if (ret < 0 || ret >= sizeof(path)) + goto out; + + setgroups_fd = open(path, O_WRONLY | O_CLOEXEC); + if (setgroups_fd < 0 && errno != ENOENT) { + syserror("Failed to open \"%s\"", path); + goto out; + } + + if (setgroups_fd >= 0) { + ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n")); + if (ret != STRLITERALLEN("deny\n")) { + syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid); + goto out; + } + } + } + + ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, 
map_type == ID_TYPE_UID ? 'u' : 'g'); + if (ret < 0 || ret >= sizeof(path)) + goto out; + + fd = open(path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + syserror("Failed to open \"%s\"", path); + goto out; + } + + ret = write_nointr(fd, buf, buf_size); + if (ret != buf_size) { + syserror("Failed to write %cid mapping to \"%s\"", + map_type == ID_TYPE_UID ? 'u' : 'g', path); + goto out; + } + + fret = 0; +out: + close(fd); + close(setgroups_fd); + + return fret; +} + +static int map_ids_from_idmap(struct list *idmap, pid_t pid) +{ + int fill, left; + char mapbuf[4096] = {}; + bool had_entry = false; + idmap_type_t map_type, u_or_g; + + if (list_empty(idmap)) + return 0; + + for (map_type = ID_TYPE_UID, u_or_g = 'u'; + map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') { + char *pos = mapbuf; + int ret; + struct list *iterator; + + + list_for_each(iterator, idmap) { + struct id_map *map = iterator->elem; + if (map->map_type != map_type) + continue; + + had_entry = true; + + left = 4096 - (pos - mapbuf); + fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range); + /* + * The kernel only takes <= 4k for writes to + * /proc/<pid>/{g,u}id_map + */ + if (fill <= 0 || fill >= left) + return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g); + + pos += fill; + } + if (!had_entry) + continue; + + ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf); + if (ret < 0) + return syserror("Failed to write mapping: %s", mapbuf); + + memset(mapbuf, 0, sizeof(mapbuf)); + } + + return 0; +} + +static int get_userns_fd_from_idmap(struct list *idmap) +{ + int ret; + pid_t pid; + char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + + STRLITERALLEN("/ns/user") + 1]; + + pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS); + if (pid < 0) + return -errno; + + ret = map_ids_from_idmap(idmap, pid); + if (ret < 0) + return ret; + + ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid); + if (ret < 0 || (size_t)ret >= sizeof(path_ns)) + ret = -EIO; + else + ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY); + + (void)kill(pid, SIGKILL); + (void)wait_for_pid(pid); + return ret; +} + +int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range) +{ + struct list head, uid_mapl, gid_mapl; + struct id_map uid_map = { + .map_type = ID_TYPE_UID, + .nsid = nsid, + .hostid = hostid, + .range = range, + }; + struct id_map gid_map = { + .map_type = ID_TYPE_GID, + .nsid = nsid, + .hostid = hostid, + .range = range, + }; + + list_init(&head); + uid_mapl.elem = &uid_map; + gid_mapl.elem = &gid_map; + list_add_tail(&head, &uid_mapl); + list_add_tail(&head, &gid_mapl); + + return get_userns_fd_from_idmap(&head); +} + +bool switch_ids(uid_t uid, gid_t gid) +{ + if (setgroups(0, NULL)) + return syserror("failure: setgroups"); + + if (setresgid(gid, gid, gid)) + return syserror("failure: setresgid"); + + if (setresuid(uid, uid, uid)) + return syserror("failure: setresuid"); + + /* Ensure we can access proc files from processes we can ptrace. */ + if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0)) + return syserror("failure: make dumpable"); + + return true; +} + +static int create_userns_hierarchy(struct userns_hierarchy *h); + +static int userns_fd_cb(void *data) +{ + struct userns_hierarchy *h = data; + char c; + int ret; + + ret = read_nointr(h->fd_event, &c, 1); + if (ret < 0) + return syserror("failure: read from socketpair"); + + /* Only switch ids if someone actually wrote a mapping for us. 
*/ + if (c == '1') { + if (!switch_ids(0, 0)) + return syserror("failure: switch ids to 0"); + } + + ret = write_nointr(h->fd_event, "1", 1); + if (ret < 0) + return syserror("failure: write to socketpair"); + + ret = create_userns_hierarchy(++h); + if (ret < 0) + return syserror("failure: userns level %d", h->level); + + return 0; +} + +static int create_userns_hierarchy(struct userns_hierarchy *h) +{ + int fret = -1; + char c; + int fd_socket[2]; + int fd_userns = -EBADF, ret = -1; + ssize_t bytes; + pid_t pid; + char path[256]; + + if (h->level == MAX_USERNS_LEVEL) + return 0; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket); + if (ret < 0) + return syserror("failure: create socketpair"); + + /* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */ + h->fd_event = fd_socket[1]; + pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM); + if (pid < 0) { + syserror("failure: userns level %d", h->level); + goto out_close; + } + + ret = map_ids_from_idmap(&h->id_map, pid); + if (ret < 0) { + kill(pid, SIGKILL); + syserror("failure: writing id mapping for userns level %d for %d", h->level, pid); + goto out_wait; + } + + if (!list_empty(&h->id_map)) + bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */ + else + bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */ + if (bytes < 0) { + kill(pid, SIGKILL); + syserror("failure: write to socketpair"); + goto out_wait; + } + + /* Wait for child to set*id() and become dumpable. */ + bytes = read_nointr(fd_socket[0], &c, 1); + if (bytes < 0) { + kill(pid, SIGKILL); + syserror("failure: read from socketpair"); + goto out_wait; + } + + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_userns = open(path, O_RDONLY | O_CLOEXEC); + if (fd_userns < 0) { + kill(pid, SIGKILL); + syserror("failure: open userns level %d for %d", h->level, pid); + goto out_wait; + } + + fret = 0; + +out_wait: + if (!wait_for_pid(pid) && !fret) { + h->fd_userns = fd_userns; + fd_userns = -EBADF; + } + +out_close: + if (fd_userns >= 0) + close(fd_userns); + close(fd_socket[0]); + close(fd_socket[1]); + return fret; +} + +/* caps_down - lower all effective caps */ +int caps_down(void) +{ + bool fret = false; + cap_t caps = NULL; + int ret = -1; + + caps = cap_get_proc(); + if (!caps) + goto out; + + ret = cap_clear_flag(caps, CAP_EFFECTIVE); + if (ret) + goto out; + + ret = cap_set_proc(caps); + if (ret) + goto out; + + fret = true; + +out: + cap_free(caps); + return fret; +} + +/* cap_down - lower an effective cap */ +int cap_down(cap_value_t down) +{ + bool fret = false; + cap_t caps = NULL; + cap_value_t cap = down; + int ret = -1; + + caps = cap_get_proc(); + if (!caps) + goto out; + + ret = cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap, 0); + if (ret) + goto out; + + ret = cap_set_proc(caps); + if (ret) + goto out; + + fret = true; + +out: + cap_free(caps); + return fret; +} diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h new file mode 100644 index 000000000000..7f1df2a3e94c --- /dev/null +++ b/tools/testing/selftests/filesystems/utils.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __IDMAP_UTILS_H +#define __IDMAP_UTILS_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <errno.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> 
+#include <sys/capability.h> +#include <sys/fsuid.h> +#include <sys/types.h> +#include <unistd.h> + +extern int get_userns_fd(unsigned long nsid, unsigned long hostid, + unsigned long range); + +extern int caps_down(void); +extern int cap_down(cap_value_t down); + +extern bool switch_ids(uid_t uid, gid_t gid); + +static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) +{ + if (setns(fd, CLONE_NEWUSER)) + return false; + + if (!switch_ids(uid, gid)) + return false; + + if (drop_caps && !caps_down()) + return false; + + return true; +} + +#endif /* __IDMAP_UTILS_H */ diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index cdf91b0ca40f..c3b6d2604b1e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -444,10 +444,6 @@ static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ... static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { -#ifdef NOLIBC - ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); - return 0; -#else unsigned int major, minor; struct utsname info; @@ -455,7 +451,6 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); -#endif } #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/kselftest/module.sh b/tools/testing/selftests/kselftest/module.sh index fb4733faff12..51fb65159932 100755 --- a/tools/testing/selftests/kselftest/module.sh +++ b/tools/testing/selftests/kselftest/module.sh @@ -11,7 +11,7 @@ # SPDX-License-Identifier: GPL-2.0+ # $(dirname $0)/../kselftest/module.sh "description" module_name # -# Example: tools/testing/selftests/lib/printf.sh +# Example: tools/testing/selftests/lib/bitmap.sh desc="" # Output prefix. module="" # Filename (without the .ko). 
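A minimal usage sketch for the utils.{c,h} helpers introduced above (illustrative only, not part of the patch): it assumes a privileged caller, and the 0 -> 10000 id mapping, the run_in_userns() wrapper name and the fork plumbing are invented for the example; only get_userns_fd() and switch_userns() come from the new utils.h.

#include <sys/wait.h>
#include <unistd.h>

#include "utils.h"

/* Run fn() as uid/gid 0 inside a fresh user namespace mapping ns ids
 * 0..9999 to host ids 10000..19999 (requires a privileged caller). */
static int run_in_userns(int (*fn)(void))
{
	int fd_userns, status;
	pid_t pid;

	fd_userns = get_userns_fd(0, 10000, 10000); /* create ns and write id maps */
	if (fd_userns < 0)
		return -1;

	pid = fork();
	if (pid == 0) {
		/* setns() into the namespace, become 0/0, drop effective caps. */
		if (!switch_userns(fd_userns, 0, 0, true))
			_exit(1);
		_exit(fn());
	}

	close(fd_userns);
	if (pid < 0 || waitpid(pid, &status, 0) < 0)
		return -1;
	return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
}

Note that the drop_caps argument only clears the effective capability set (caps_down() above leaves the permitted set untouched), so a test can still re-raise individual capabilities afterwards if it needs them.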
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index d6edcfcb5be8..530390033929 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -228,4 +228,7 @@ $(OUTPUT)/%:%.S $(LINK.S) $^ $(LDLIBS) -o $@ endif -.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir +headers: + $(Q)$(MAKE) -C $(top_srcdir) headers + +.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir headers diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index c52fe3ad8e98..f876bf4744e1 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,5 +4,5 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh +TEST_PROGS := bitmap.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index dc15aba8d0a3..81a1f64a22e8 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -1,5 +1,2 @@ -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m -CONFIG_PRIME_NUMBERS=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/prime_numbers.sh b/tools/testing/selftests/lib/prime_numbers.sh deleted file mode 100755 index 370b79a9cb2e..000000000000 --- a/tools/testing/selftests/lib/prime_numbers.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Checks fast/slow prime_number generation for inconsistencies -$(dirname $0)/../kselftest/module.sh "prime numbers" prime_numbers selftest=65536 diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh deleted file mode 100755 index 05f4544e87f9..000000000000 --- a/tools/testing/selftests/lib/printf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Tests the printf infrastructure using test_printf kernel module. -$(dirname $0)/../kselftest/module.sh "printf" test_printf diff --git a/tools/testing/selftests/lib/scanf.sh b/tools/testing/selftests/lib/scanf.sh deleted file mode 100755 index b59b8ba561c3..000000000000 --- a/tools/testing/selftests/lib/scanf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Tests the scanf infrastructure using test_scanf kernel module. -$(dirname $0)/../kselftest/module.sh "scanf" test_scanf diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index ece37212a8a2..525c50d3ec23 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -19,6 +19,8 @@ #include <sys/uio.h> #include <unistd.h> +#include "../pidfd/pidfd.h" + /* * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1: * @@ -50,11 +52,6 @@ static void handle_fatal(int c) siglongjmp(signal_jmp_buf, c); } -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(SYS_pidfd_open, pid, flags); -} - static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, size_t n, int advice, unsigned int flags) { @@ -370,14 +367,10 @@ TEST_F(guard_pages, multi_vma) TEST_F(guard_pages, process_madvise) { const unsigned long page_size = self->page_size; - pid_t pid = getpid(); - int pidfd = pidfd_open(pid, 0); char *ptr_region, *ptr1, *ptr2, *ptr3; ssize_t count; struct iovec vec[6]; - ASSERT_NE(pidfd, -1); - /* Reserve region to map over. 
*/ ptr_region = mmap(NULL, 100 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); @@ -425,7 +418,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -446,7 +439,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -463,7 +456,6 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(ptr1, 10 * page_size), 0); ASSERT_EQ(munmap(ptr2, 5 * page_size), 0); ASSERT_EQ(munmap(ptr3, 20 * page_size), 0); - close(pidfd); } /* Assert that unmapping ranges does not leave guard markers behind. */ diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index ad17005521a8..005f29c86484 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -218,7 +218,7 @@ bool seal_support(void) bool pkey_supported(void) { #if defined(__i386__) || defined(__x86_64__) /* arch */ - int pkey = sys_pkey_alloc(0, 0); + int pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (pkey > 0) return true; @@ -1671,7 +1671,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) setup_single_address_rw(size, &ptr); FAIL_TEST_IF_FALSE(ptr != (void *)-1); - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_TEST_IF_FALSE(pkey > 0); ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); @@ -1683,7 +1683,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) } /* sealing doesn't take effect if PKRU allow write. 
*/ - set_pkey(pkey, 0); + set_pkey(pkey, PKEY_UNRESTRICTED); ret = sys_madvise(ptr, size, MADV_DONTNEED); FAIL_TEST_IF_FALSE(!ret); diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index f080e97b39be..ea404f80e6cb 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,6 +13,7 @@ #include <ucontext.h> #include <sys/mman.h> +#include <linux/mman.h> #include <linux/types.h> #include "../kselftest.h" @@ -193,7 +194,7 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) static inline int kernel_has_pkeys(void) { /* try allocating a key and see if it succeeds */ - int ret = sys_pkey_alloc(0, 0); + int ret = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (ret <= 0) { return 0; } diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index 1ac8c8809880..b5e076a564c9 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -311,7 +311,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ @@ -484,7 +484,7 @@ static void test_pkru_sigreturn(void) __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 35565af308af..23ebec367015 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -463,7 +463,7 @@ static pid_t fork_lazy_child(void) static int alloc_pkey(void) { int ret; - unsigned long init_val = 0x0; + unsigned long init_val = PKEY_UNRESTRICTED; dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index da7e26668103..7cc71d942f83 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -304,7 +304,9 @@ uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 # Hugetlb tests require source and destination huge pages. Pass in half # the size of the free pages we have, which is used for *each*. -half_ufd_size_MB=$((freepgs / 2)) +# uffd-stress expects a region expressed in MiB, so we adjust +# half_ufd_size_MB accordingly. 
+half_ufd_size_MB=$(((freepgs * hpgsize_KB) / 1024 / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 70f65eb320a7..48a000cabc97 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -20,6 +20,7 @@ #include <stdarg.h> #include <linux/mount.h> +#include "../filesystems/overlayfs/wrappers.h" #include "../kselftest_harness.h" #ifndef CLONE_NEWNS @@ -126,6 +127,26 @@ #endif #endif +#ifndef __NR_move_mount + #if defined __alpha__ + #define __NR_move_mount 539 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_move_mount 4429 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_move_mount 6429 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_move_mount 5429 + #endif + #elif defined __ia64__ + #define __NR_move_mount (428 + 1024) + #else + #define __NR_move_mount 429 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -397,6 +418,10 @@ FIXTURE_SETUP(mount_setattr) ASSERT_EQ(mkdir("/tmp/B/BB", 0777), 0); + ASSERT_EQ(mkdir("/tmp/target1", 0777), 0); + + ASSERT_EQ(mkdir("/tmp/target2", 0777), 0); + ASSERT_EQ(mount("testing", "/tmp/B/BB", "tmpfs", MS_NOATIME | MS_NODEV, "size=100000,mode=700"), 0); @@ -1506,4 +1531,631 @@ TEST_F(mount_setattr, mount_attr_nosymfollow) ASSERT_EQ(close(fd), 0); } +TEST_F(mount_setattr, open_tree_detached) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /AA testing tmpfs + * `-/AA/B testing tmpfs + * `-/AA/B/BB testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_subdir, "B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_subdir, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target1 testing tmpfs + * `-/tmp/target1/B testing tmpfs + * `-/tmp/target1/B/BB testing tmpfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, 
"/tmp/target1/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target1/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target2", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target2 testing tmpfs + * |-/tmp/target2/A testing tmpfs + * | `-/tmp/target2/A/AA testing tmpfs + * | `-/tmp/target2/A/AA/B testing tmpfs + * | `-/tmp/target2/A/AA/B/BB testing tmpfs + * `-/tmp/target2/B testing ramfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target2", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, open_tree_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + /* + * The origin mount namespace of the anonymous mount namespace + * of @fd_tree_base doesn't match the caller's mount namespace + * anymore so creation of another detached mounts must fail. 
+ */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + +TEST_F(mount_setattr, open_tree_detached_fail2) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(create_and_enter_userns(), 0); + + /* + * The caller entered a new user namespace. They will have + * CAP_SYS_ADMIN in this user namespace. However, they're still + * located in a mount namespace that is owned by an ancestor + * user namespace in which they hold no privilege. Creating a + * detached mount must thus fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EPERM); +} + +TEST_F(mount_setattr, open_tree_detached_fail3) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(prepare_unpriv_mountns(), 0); + + /* + * The caller entered a new mount namespace. They will have + * CAP_SYS_ADMIN in the owning user namespace of their mount + * namespace. + * + * However, the origin mount namespace of the anonymous mount + * namespace of @fd_tree_base doesn't match the caller's mount + * namespace anymore so creation of another detached mounts must + * fail. 
+ */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + +TEST_F(mount_setattr, open_tree_subfolder) +{ + int fd_context, fd_tmpfs, fd_tree; + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + + EXPECT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "subdir", 0755), 0); + + fd_tree = sys_open_tree(fd_tmpfs, "subdir", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + EXPECT_EQ(close(fd_tmpfs), 0); + + ASSERT_EQ(mkdirat(-EBADF, "/mnt/open_tree_subfolder", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tree, "", -EBADF, "/mnt/open_tree_subfolder", MOVE_MOUNT_F_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree), 0); + + ASSERT_EQ(umount2("/mnt/open_tree_subfolder", 0), 0); + + EXPECT_EQ(rmdir("/mnt/open_tree_subfolder"), 0); +} + +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_then_close) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_and_attach) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + __u64 mnt_id = 0; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &stx), 0); + 
ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + mnt_id = stx.stx_mnt_id; + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, STATX_MNT_ID_UNIQUE, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + ASSERT_EQ(stx.stx_mnt_id, mnt_id); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, move_mount_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(-EBADF, "/tmp/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + ASSERT_EQ(statx(fd_tree_subdir, "BB", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Not allowed to move an attached mount to a detached mount. */ + ASSERT_NE(move_mount(fd_tree_base, "", fd_tree_subdir, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(errno, EINVAL); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, attach_detached_mount_then_umount_then_close) +{ + int fd_tree = -EBADF; + struct statx stx; + + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx), 0); + /* We copied with AT_RECURSIVE so /mnt/A must be a mountpoint. */ + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(umount2("/tmp/target1", MNT_DETACH), 0); + + /* + * This tests whether dissolve_on_fput() handles a NULL mount + * namespace correctly, i.e., that it doesn't splat. + */ + EXPECT_EQ(close(fd_tree), 0); +} + +TEST_F(mount_setattr, mount_detached1_onto_detached2_then_close_detached1_then_mount_detached2_onto_attached) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * `-/mnt/B testing ramfs + */ + fd_tree2 = sys_open_tree(-EBADF, "/mnt/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree2, 0); + + /* + * Move the source detached mount tree to the target detached + * mount tree. 
This will move all the mounts in the source mount + * tree from the source anonymous mount namespace to the target + * anonymous mount namespace. + * + * The source detached mount tree and the target detached mount + * tree now both refer to the same anonymous mount namespace. + * + * |-"" testing ramfs + * `-"" testing tmpfs + * `-""/AA testing tmpfs + * `-""/AA/B testing tmpfs + * `-""/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree1, "", fd_tree2, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + /* + * The source detached mount tree @fd_tree1 is now an attached + * mount, i.e., it has a parent. Specifically, it now has the + * root mount of the mount tree of @fd_tree2 as its parent. + * + * That means we are no longer allowed to attach it as we only + * allow attaching the root of an anonymous mount tree, not + * random bits and pieces. Verify that the kernel enforces this. + */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * Closing the source detached mount tree must not unmount and + * free the shared anonymous mount namespace. The kernel will + * quickly yell at us because the anonymous mount namespace + * won't be empty when it's freed. + */ + EXPECT_EQ(close(fd_tree1), 0); + + /* + * Attach the mount tree to a non-anonymous mount namespace. + * This can only succeed if closing fd_tree1 had proper + * semantics and didn't cause the anonymous mount namespace to + * be freed. If it did this will trigger a UAF which will be + * visible on any KASAN enabled kernel. + * + * |-/tmp/target1 testing ramfs + * `-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + EXPECT_EQ(close(fd_tree2), 0); +} + +TEST_F(mount_setattr, two_detached_mounts_referring_to_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file + * that refers to the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * Copy the following mount tree: + * + * |-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * This must fail as this would mean adding the same mount tree + * into the same mount tree. 
+ */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + +TEST_F(mount_setattr, two_detached_subtrees_of_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file that + * refers to a subtree of the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * This must fail as it is only possible to attach the root of a + * detached mount tree. + */ + ASSERT_NE(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + +TEST_F(mount_setattr, detached_tree_propagation) +{ + int fd_tree = -EBADF; + struct statx stx1, stx2, stx3, stx4; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount(NULL, "/mnt", NULL, MS_REC | MS_SHARED, NULL), 0); + + /* + * Copy the following mount tree: + * + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx1), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx2), 0); + + /* + * Copying the mount namespace like done above doesn't alter the + * mounts in any way so the filesystem mounted on /mnt must be + * identical even though the mounts will differ. Use the device + * information to verify that. Note that tmpfs will have a 0 + * major number so comparing the major number is misleading. + */ + ASSERT_EQ(stx1.stx_dev_minor, stx2.stx_dev_minor); + + /* Mount a tmpfs filesystem over /mnt/A. */ + ASSERT_EQ(mount(NULL, "/mnt/A", "tmpfs", 0, NULL), 0); + + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx3), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx4), 0); + + /* + * A new filesystem has been mounted on top of /mnt/A which + * means that the device information will be different for any + * statx() that was taken from /mnt/A before the mount compared + * to one after the mount. + * + * Since we already now that the device information between the + * stx1 and stx2 samples are identical we also now that stx2 and + * stx3 device information will necessarily differ. + */ + ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor); + + /* + * If mount propagation worked correctly then the tmpfs mount + * that was created after the mount namespace was unshared will + * have propagated onto /mnt/A in the detached mount tree. + * + * Verify that the device information for stx3 and stx4 are + * identical. It is already established that stx3 is different + * from both stx1 and stx2 sampled before the tmpfs mount was + * done so if stx3 and stx4 are identical the proof is done. 
+ */ + ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor); + + EXPECT_EQ(close(fd_tree), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 5916f3b81c39..8f32b4f01aee 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -31,7 +31,6 @@ TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh -TEST_PROGS += gre_ipv6_lladdr.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_so_priority.sh TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh @@ -101,6 +100,7 @@ TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh +TEST_PROGS += lwt_dst_cache_ref_loop.sh # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 5b9baf708950..61e5116987f3 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -107,3 +107,5 @@ CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IPV6_ILA=m +CONFIG_IPV6_RPL_LWTUNNEL=y diff --git a/tools/testing/selftests/net/gre_ipv6_lladdr.sh b/tools/testing/selftests/net/gre_ipv6_lladdr.sh deleted file mode 100755 index 5b34f6e1f831..000000000000 --- a/tools/testing/selftests/net/gre_ipv6_lladdr.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source ./lib.sh - -PAUSE_ON_FAIL="no" - -# The trap function handler -# -exit_cleanup_all() -{ - cleanup_all_ns - - exit "${EXIT_STATUS}" -} - -# Add fake IPv4 and IPv6 networks on the loopback device, to be used as -# underlay by future GRE devices. -# -setup_basenet() -{ - ip -netns "${NS0}" link set dev lo up - ip -netns "${NS0}" address add dev lo 192.0.2.10/24 - ip -netns "${NS0}" address add dev lo 2001:db8::10/64 nodad -} - -# Check if network device has an IPv6 link-local address assigned. -# -# Parameters: -# -# * $1: The network device to test -# * $2: An extra regular expression that should be matched (to verify the -# presence of extra attributes) -# * $3: The expected return code from grep (to allow checking the absence of -# a link-local address) -# * $4: The user visible name for the scenario being tested -# -check_ipv6_ll_addr() -{ - local DEV="$1" - local EXTRA_MATCH="$2" - local XRET="$3" - local MSG="$4" - - RET=0 - set +e - ip -netns "${NS0}" -6 address show dev "${DEV}" scope link | grep "fe80::" | grep -q "${EXTRA_MATCH}" - check_err_fail "${XRET}" $? "" - log_test "${MSG}" - set -e -} - -# Create a GRE device and verify that it gets an IPv6 link-local address as -# expected. -# -# Parameters: -# -# * $1: The device type (gre, ip6gre, gretap or ip6gretap) -# * $2: The local underlay IP address (can be an IPv4, an IPv6 or "any") -# * $3: The remote underlay IP address (can be an IPv4, an IPv6 or "any") -# * $4: The IPv6 interface identifier generation mode to use for the GRE -# device (eui64, none, stable-privacy or random). 
-# -test_gre_device() -{ - local GRE_TYPE="$1" - local LOCAL_IP="$2" - local REMOTE_IP="$3" - local MODE="$4" - local ADDR_GEN_MODE - local MATCH_REGEXP - local MSG - - ip link add netns "${NS0}" name gretest type "${GRE_TYPE}" local "${LOCAL_IP}" remote "${REMOTE_IP}" - - case "${MODE}" in - "eui64") - ADDR_GEN_MODE=0 - MATCH_REGEXP="" - MSG="${GRE_TYPE}, mode: 0 (EUI64), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - ;; - "none") - ADDR_GEN_MODE=1 - MATCH_REGEXP="" - MSG="${GRE_TYPE}, mode: 1 (none), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=1 # No link-local address should be generated - ;; - "stable-privacy") - ADDR_GEN_MODE=2 - MATCH_REGEXP="stable-privacy" - MSG="${GRE_TYPE}, mode: 2 (stable privacy), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - # Initialise stable_secret (required for stable-privacy mode) - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.stable_secret="2001:db8::abcd" - ;; - "random") - ADDR_GEN_MODE=3 - MATCH_REGEXP="stable-privacy" - MSG="${GRE_TYPE}, mode: 3 (random), ${LOCAL_IP} -> ${REMOTE_IP}" - XRET=0 - ;; - esac - - # Check that IPv6 link-local address is generated when device goes up - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" - ip -netns "${NS0}" link set dev gretest up - check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "config: ${MSG}" - - # Now disable link-local address generation - ip -netns "${NS0}" link set dev gretest down - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode=1 - ip -netns "${NS0}" link set dev gretest up - - # Check that link-local address generation works when re-enabled while - # the device is already up - ip netns exec "${NS0}" sysctl -qw net.ipv6.conf.gretest.addr_gen_mode="${ADDR_GEN_MODE}" - check_ipv6_ll_addr gretest "${MATCH_REGEXP}" "${XRET}" "update: ${MSG}" - - ip -netns "${NS0}" link del dev gretest -} - -test_gre4() -{ - local GRE_TYPE - local MODE - - for GRE_TYPE in "gre" "gretap"; do - printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" - - for MODE in "eui64" "none" "stable-privacy" "random"; do - test_gre_device "${GRE_TYPE}" 192.0.2.10 192.0.2.11 "${MODE}" - test_gre_device "${GRE_TYPE}" any 192.0.2.11 "${MODE}" - test_gre_device "${GRE_TYPE}" 192.0.2.10 any "${MODE}" - done - done -} - -test_gre6() -{ - local GRE_TYPE - local MODE - - for GRE_TYPE in "ip6gre" "ip6gretap"; do - printf "\n####\nTesting IPv6 link-local address generation on ${GRE_TYPE} devices\n####\n\n" - - for MODE in "eui64" "none" "stable-privacy" "random"; do - test_gre_device "${GRE_TYPE}" 2001:db8::10 2001:db8::11 "${MODE}" - test_gre_device "${GRE_TYPE}" any 2001:db8::11 "${MODE}" - test_gre_device "${GRE_TYPE}" 2001:db8::10 any "${MODE}" - done - done -} - -usage() -{ - echo "Usage: $0 [-p]" - exit 1 -} - -while getopts :p o -do - case $o in - p) PAUSE_ON_FAIL="yes";; - *) usage;; - esac -done - -setup_ns NS0 - -set -e -trap exit_cleanup_all EXIT - -setup_basenet - -test_gre4 -test_gre6 diff --git a/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh new file mode 100755 index 000000000000..881eb399798f --- /dev/null +++ b/tools/testing/selftests/net/lwt_dst_cache_ref_loop.sh @@ -0,0 +1,246 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Author: Justin Iurman <justin.iurman@uliege.be> +# +# WARNING +# ------- +# This is just a dummy script that triggers encap cases with possible dst cache +# reference loops in affected lwt users (see list below). 
Some cases are +# pathological configurations for simplicity, others are valid. Overall, we +# don't want this issue to happen, no matter what. In order to catch any +# reference loops, kmemleak MUST be used. The results alone are always blindly +# successful, don't rely on them. Note that the following tests may crash the +# kernel if the fix to prevent lwtunnel_{input|output|xmit}() reentry loops is +# not present. +# +# Affected lwt users so far (please update accordingly if needed): +# - ila_lwt (output only) +# - ioam6_iptunnel (output only) +# - rpl_iptunnel (both input and output) +# - seg6_iptunnel (both input and output) + +source lib.sh + +check_compatibility() +{ + setup_ns tmp_node &>/dev/null + if [ $? != 0 ]; then + echo "SKIP: Cannot create netns." + exit $ksft_skip + fi + + ip link add name veth0 netns $tmp_node type veth \ + peer name veth1 netns $tmp_node &>/dev/null + local ret=$? + + ip -netns $tmp_node link set veth0 up &>/dev/null + ret=$((ret + $?)) + + ip -netns $tmp_node link set veth1 up &>/dev/null + ret=$((ret + $?)) + + if [ $ret != 0 ]; then + echo "SKIP: Cannot configure links." + cleanup_ns $tmp_node + exit $ksft_skip + fi + + lsmod 2>/dev/null | grep -q "ila" + ila_lsmod=$? + [ $ila_lsmod != 0 ] && modprobe ila &>/dev/null + + ip -netns $tmp_node route add 2001:db8:1::/64 \ + encap ila 1:2:3:4 csum-mode no-action ident-type luid \ + hook-type output \ + dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:2::/64 \ + encap ioam6 trace prealloc type 0x800000 ns 0 size 4 \ + dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:3::/64 \ + encap rpl segs 2001:db8:3::1 dev veth0 &>/dev/null + + ip -netns $tmp_node route add 2001:db8:4::/64 \ + encap seg6 mode inline segs 2001:db8:4::1 dev veth0 &>/dev/null + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ila" + skip_ila=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap ioam6" + skip_ioam6=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap rpl" + skip_rpl=$? + + ip -netns $tmp_node -6 route 2>/dev/null | grep -q "encap seg6" + skip_seg6=$? 
+ + cleanup_ns $tmp_node +} + +setup() +{ + setup_ns alpha beta gamma &>/dev/null + + ip link add name veth-alpha netns $alpha type veth \ + peer name veth-betaL netns $beta &>/dev/null + + ip link add name veth-betaR netns $beta type veth \ + peer name veth-gamma netns $gamma &>/dev/null + + ip -netns $alpha link set veth-alpha name veth0 &>/dev/null + ip -netns $beta link set veth-betaL name veth0 &>/dev/null + ip -netns $beta link set veth-betaR name veth1 &>/dev/null + ip -netns $gamma link set veth-gamma name veth0 &>/dev/null + + ip -netns $alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null + ip -netns $alpha link set veth0 up &>/dev/null + ip -netns $alpha link set lo up &>/dev/null + ip -netns $alpha route add 2001:db8:2::/64 \ + via 2001:db8:1::1 dev veth0 &>/dev/null + + ip -netns $beta addr add 2001:db8:1::1/64 dev veth0 &>/dev/null + ip -netns $beta addr add 2001:db8:2::1/64 dev veth1 &>/dev/null + ip -netns $beta link set veth0 up &>/dev/null + ip -netns $beta link set veth1 up &>/dev/null + ip -netns $beta link set lo up &>/dev/null + ip -netns $beta route del 2001:db8:2::/64 + ip -netns $beta route add 2001:db8:2::/64 dev veth1 + ip netns exec $beta \ + sysctl -wq net.ipv6.conf.all.forwarding=1 &>/dev/null + + ip -netns $gamma addr add 2001:db8:2::2/64 dev veth0 &>/dev/null + ip -netns $gamma link set veth0 up &>/dev/null + ip -netns $gamma link set lo up &>/dev/null + ip -netns $gamma route add 2001:db8:1::/64 \ + via 2001:db8:2::1 dev veth0 &>/dev/null + + sleep 1 + + ip netns exec $alpha ping6 -c 5 -W 1 2001:db8:2::2 &>/dev/null + if [ $? != 0 ]; then + echo "SKIP: Setup failed." + exit $ksft_skip + fi + + sleep 1 +} + +cleanup() +{ + cleanup_ns $alpha $beta $gamma + [ $ila_lsmod != 0 ] && modprobe -r ila &>/dev/null +} + +run_ila() +{ + if [ $skip_ila != 0 ]; then + echo "SKIP: ila (output)" + return + fi + + ip -netns $beta route del 2001:db8:2::/64 + ip -netns $beta route add 2001:db8:2:0:0:0:0:2/128 \ + encap ila 2001:db8:2:0 csum-mode no-action ident-type luid \ + hook-type output \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: ila (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + ip -netns $beta route del 2001:db8:2:0:0:0:0:2/128 + ip -netns $beta route add 2001:db8:2::/64 dev veth1 + sleep 1 +} + +run_ioam6() +{ + if [ $skip_ioam6 != 0 ]; then + echo "SKIP: ioam6 (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap ioam6 trace prealloc type 0x800000 ns 1 size 4 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: ioam6 (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 +} + +run_rpl() +{ + if [ $skip_rpl != 0 ]; then + echo "SKIP: rpl (input)" + echo "SKIP: rpl (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap rpl segs 2001:db8:2::2 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: rpl (input)" + ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + echo "TEST: rpl (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 +} + +run_seg6() +{ + if [ $skip_seg6 != 0 ]; then + echo "SKIP: seg6 (input)" + echo "SKIP: seg6 (output)" + return + fi + + ip -netns $beta route change 2001:db8:2::/64 \ + encap seg6 mode inline segs 2001:db8:2::2 \ + dev veth1 &>/dev/null + sleep 1 + + echo "TEST: seg6 (input)" + ip netns exec $alpha ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 + + echo "TEST: seg6 (output)" + ip netns exec $beta ping6 -c 2 -W 1 2001:db8:2::2 &>/dev/null + sleep 1 
+} + +run() +{ + run_ila + run_ioam6 + run_rpl + run_seg6 +} + +if [ "$(id -u)" -ne 0 ]; then + echo "SKIP: Need root privileges." + exit $ksft_skip +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool." + exit $ksft_skip +fi + +check_compatibility + +trap cleanup EXIT + +setup +run + +exit $ksft_pass diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7d14a7c0cb62..58bcbbd029bc 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -47,6 +47,7 @@ XARCH_riscv = riscv64 XARCH = $(or $(XARCH_$(ARCH)),$(ARCH)) # map from user input variants to their kernel supported architectures +ARCH_armthumb = arm ARCH_ppc = powerpc ARCH_ppc64 = powerpc ARCH_ppc64le = powerpc @@ -54,6 +55,7 @@ ARCH_mips32le = mips ARCH_mips32be = mips ARCH_riscv32 = riscv ARCH_riscv64 = riscv +ARCH_s390x = s390 ARCH := $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -62,6 +64,7 @@ IMAGE_x86_64 = arch/x86/boot/bzImage IMAGE_x86 = arch/x86/boot/bzImage IMAGE_arm64 = arch/arm64/boot/Image IMAGE_arm = arch/arm/boot/zImage +IMAGE_armthumb = arch/arm/boot/zImage IMAGE_mips32le = vmlinuz IMAGE_mips32be = vmlinuz IMAGE_ppc = vmlinux @@ -70,6 +73,7 @@ IMAGE_ppc64le = arch/powerpc/boot/zImage IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image +IMAGE_s390x = arch/s390/boot/bzImage IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE = $(objtree)/$(IMAGE_$(XARCH)) @@ -81,19 +85,20 @@ DEFCONFIG_x86_64 = defconfig DEFCONFIG_x86 = defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig +DEFCONFIG_armthumb = multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig -DEFCONFIG_mips32be = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig generic/eb.config DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig -DEFCONFIG_s390 = defconfig +DEFCONFIG_s390x = defconfig +DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG = $(DEFCONFIG_$(XARCH)) -EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) # optional tests to run (default = all) @@ -105,6 +110,7 @@ QEMU_ARCH_x86_64 = x86_64 QEMU_ARCH_x86 = x86_64 QEMU_ARCH_arm64 = aarch64 QEMU_ARCH_arm = arm +QEMU_ARCH_armthumb = arm QEMU_ARCH_mips32le = mipsel # works with malta_defconfig QEMU_ARCH_mips32be = mips QEMU_ARCH_ppc = ppc @@ -113,6 +119,7 @@ QEMU_ARCH_ppc64le = ppc64 QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 QEMU_ARCH_riscv64 = riscv64 +QEMU_ARCH_s390x = s390x QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH = $(QEMU_ARCH_$(XARCH)) @@ -133,6 +140,7 @@ QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $( QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_armthumb = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_mips32le = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_mips32be = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc = -M g3beige -append "console=ttyS0 panic=-1 
$(TEST:%=NOLIBC_TEST=%)" @@ -141,6 +149,7 @@ QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS = -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) @@ -156,15 +165,18 @@ Q=@ endif CFLAGS_i386 = $(call cc-option,-m32) +CFLAGS_arm = -marm +CFLAGS_armthumb = -mthumb -march=armv6t2 CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) -CFLAGS_s390 = -m64 +CFLAGS_s390x = -m64 +CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ - $(call cc-option,-fno-stack-protector) \ + $(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \ $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := @@ -220,7 +232,7 @@ all: run sysroot: sysroot/$(ARCH)/include -sysroot/$(ARCH)/include: +sysroot/$(ARCH)/include: | defconfig $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot $(Q)$(MAKE) -C $(srctree) outputmakefile @@ -264,16 +276,16 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ fi -kernel: +kernel: | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -kernel-standalone: initramfs +kernel-standalone: initramfs | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel diff --git a/tools/testing/selftests/nolibc/nolibc-test-linkage.c b/tools/testing/selftests/nolibc/nolibc-test-linkage.c index 5ff4c8a1db2a..a7ca8325863f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test-linkage.c +++ b/tools/testing/selftests/nolibc/nolibc-test-linkage.c @@ -11,16 +11,16 @@ void *linkage_test_errno_addr(void) return &errno; } -int linkage_test_constructor_test_value; +int linkage_test_constructor_test_value = 0; __attribute__((constructor)) static void constructor1(void) { - linkage_test_constructor_test_value = 2; + linkage_test_constructor_test_value |= 1 << 0; } __attribute__((constructor)) static void constructor2(void) { - linkage_test_constructor_test_value *= 3; + linkage_test_constructor_test_value |= 1 << 1; } diff --git 
a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 0e0e3b48a8c3..5884a891c491 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -43,6 +43,8 @@ #endif #endif +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + #include "nolibc-test-linkage.h" /* for the type of int_fast16_t and int_fast32_t, musl differs from glibc and nolibc */ @@ -690,14 +692,14 @@ int expect_strtox(int llen, void *func, const char *input, int base, intmax_t ex __attribute__((constructor)) static void constructor1(void) { - constructor_test_value = 1; + constructor_test_value |= 1 << 0; } __attribute__((constructor)) static void constructor2(int argc, char **argv, char **envp) { if (argc && argv && envp) - constructor_test_value *= 2; + constructor_test_value |= 1 << 1; } int run_startup(int min, int max) @@ -736,9 +738,9 @@ int run_startup(int min, int max) CASE_TEST(environ_HOME); EXPECT_PTRNZ(1, getenv("HOME")); break; CASE_TEST(auxv_addr); EXPECT_PTRGT(test_auxv != (void *)-1, test_auxv, brk); break; CASE_TEST(auxv_AT_UID); EXPECT_EQ(1, getauxval(AT_UID), getuid()); break; - CASE_TEST(constructor); EXPECT_EQ(1, constructor_test_value, 2); break; + CASE_TEST(constructor); EXPECT_EQ(is_nolibc, constructor_test_value, 0x3); break; CASE_TEST(linkage_errno); EXPECT_PTREQ(1, linkage_test_errno_addr(), &errno); break; - CASE_TEST(linkage_constr); EXPECT_EQ(1, linkage_test_constructor_test_value, 6); break; + CASE_TEST(linkage_constr); EXPECT_EQ(1, linkage_test_constructor_test_value, 0x3); break; case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ @@ -767,6 +769,44 @@ int test_getdents64(const char *dir) return ret; } +static int test_dirent(void) +{ + int comm = 0, cmdline = 0; + struct dirent dirent, *result; + DIR *dir; + int ret; + + dir = opendir("/proc/self"); + if (!dir) + return 1; + + while (1) { + errno = 0; + ret = readdir_r(dir, &dirent, &result); + if (ret != 0) + return 1; + if (!result) + break; + + if (strcmp(dirent.d_name, "comm") == 0) + comm++; + else if (strcmp(dirent.d_name, "cmdline") == 0) + cmdline++; + } + + if (errno) + return 1; + + ret = closedir(dir); + if (ret) + return 1; + + if (comm != 1 || cmdline != 1) + return 1; + + return 0; +} + int test_getpagesize(void) { int x = getpagesize(); @@ -988,6 +1028,22 @@ int test_rlimit(void) return 0; } +int test_openat(void) +{ + int dev, null; + + dev = openat(AT_FDCWD, "/dev", O_DIRECTORY); + if (dev < 0) + return -1; + + null = openat(dev, "null", O_RDONLY); + close(dev); + if (null < 0) + return -1; + + close(null); + return 0; +} /* Run syscall tests between IDs <min> and <max>. * Return 0 on success, non-zero on failure. 
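Both constructor tests above now record each constructor in its own bit rather than chaining an assignment and a multiplication, so the expected final value (0x3) no longer depends on which constructor the runtime happens to invoke first. A small standalone illustration of the pattern, with hypothetical names and not part of the selftest:

#include <stdio.h>

static int constructor_flags;

/*
 * Each constructor sets its own bit, so the final value is 0x3 whenever
 * both ran, regardless of the order the runtime invoked them in.
 */
__attribute__((constructor))
static void ctor_a(void)
{
	constructor_flags |= 1 << 0;
}

__attribute__((constructor))
static void ctor_b(void)
{
	constructor_flags |= 1 << 1;
}

int main(void)
{
	printf("constructor_flags = %#x\n", constructor_flags);
	return constructor_flags == 0x3 ? 0 : 1;
}

The earlier scheme (assign in one constructor, multiply in the other) only produced the expected value when the constructors ran in declaration order.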
@@ -1059,6 +1115,7 @@ int run_syscall(int min, int max) CASE_TEST(fork); EXPECT_SYSZR(1, test_fork()); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; + CASE_TEST(directories); EXPECT_SYSZR(proc, test_dirent()); break; CASE_TEST(gettimeofday_tv); EXPECT_SYSZR(1, gettimeofday(&tv, NULL)); break; CASE_TEST(gettimeofday_tv_tz);EXPECT_SYSZR(1, gettimeofday(&tv, &tz)); break; CASE_TEST(getpagesize); EXPECT_SYSZR(1, test_getpagesize()); break; @@ -1073,8 +1130,9 @@ int run_syscall(int min, int max) CASE_TEST(mmap_bad); EXPECT_PTRER(1, mmap(NULL, 0, PROT_READ, MAP_PRIVATE, 0, 0), MAP_FAILED, EINVAL); break; CASE_TEST(munmap_bad); EXPECT_SYSER(1, munmap(NULL, 0), -1, EINVAL); break; CASE_TEST(mmap_munmap_good); EXPECT_SYSZR(1, test_mmap_munmap()); break; - CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break; - CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", O_RDONLY), -1); if (tmp != -1) close(tmp); break; + CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", O_RDONLY), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(openat_dir); EXPECT_SYSZR(1, test_openat()); break; CASE_TEST(pipe); EXPECT_SYSZR(1, test_pipe()); break; CASE_TEST(poll_null); EXPECT_SYSZR(1, poll(NULL, 0, 0)); break; CASE_TEST(poll_stdout); EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break; @@ -1284,6 +1342,73 @@ static int expect_vfprintf(int llen, int c, const char *expected, const char *fm return ret; } +static int test_scanf(void) +{ + unsigned long long ull; + unsigned long ul; + unsigned int u; + long long ll; + long l; + void *p; + int i; + + /* return __LINE__ to point to the specific failure */ + + /* test EOF */ + if (sscanf("", "foo") != EOF) + return __LINE__; + + /* test simple literal without placeholder */ + if (sscanf("foo", "foo") != 0) + return __LINE__; + + /* test single placeholder */ + if (sscanf("123", "%d", &i) != 1) + return __LINE__; + + if (i != 123) + return __LINE__; + + /* test multiple place holders and separators */ + if (sscanf("a123b456c0x90", "a%db%uc%p", &i, &u, &p) != 3) + return __LINE__; + + if (i != 123) + return __LINE__; + + if (u != 456) + return __LINE__; + + if (p != (void *)0x90) + return __LINE__; + + /* test space handling */ + if (sscanf("a b1", "a b%d", &i) != 1) + return __LINE__; + + if (i != 1) + return __LINE__; + + /* test literal percent */ + if (sscanf("a%1", "a%%%d", &i) != 1) + return __LINE__; + + if (i != 1) + return __LINE__; + + /* test stdint.h types */ + if (sscanf("1|2|3|4|5|6", + "%d|%ld|%lld|%u|%lu|%llu", + &i, &l, &ll, &u, &ul, &ull) != 6) + return __LINE__; + + if (i != 1 || l != 2 || ll != 3 || + u != 4 || ul != 5 || ull != 6) + return __LINE__; + + return 0; +} + static int run_vfprintf(int min, int max) { int test; @@ -1305,6 +1430,7 @@ static int run_vfprintf(int min, int max) CASE_TEST(char); EXPECT_VFPRINTF(1, "c", "%c", 'c'); break; CASE_TEST(hex); EXPECT_VFPRINTF(1, "f", "%x", 0xf); break; CASE_TEST(pointer); EXPECT_VFPRINTF(3, "0x1", "%p", (void *) 0x1); break; + CASE_TEST(scanf); EXPECT_ZR(1, test_scanf()); break; case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ diff --git 
a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 9c5160c53881..0299a0912d40 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -17,7 +17,16 @@ perform_download=0 test_mode=system werror=1 llvm= -archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390 loongarch" +all_archs=( + i386 x86_64 + arm64 arm armthumb + mips32le mips32be + ppc ppc64 ppc64le + riscv32 riscv64 + s390x s390 + loongarch +) +archs="${all_archs[@]}" TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") @@ -94,19 +103,21 @@ fi crosstool_arch() { case "$1" in arm64) echo aarch64;; + armthumb) echo arm;; ppc) echo powerpc;; ppc64) echo powerpc64;; ppc64le) echo powerpc64;; riscv) echo riscv64;; loongarch) echo loongarch64;; mips*) echo mips;; + s390*) echo s390;; *) echo "$1";; esac } crosstool_abi() { case "$1" in - arm) echo linux-gnueabi;; + arm | armthumb) echo linux-gnueabi;; *) echo linux;; esac } @@ -157,10 +168,6 @@ test_arch() { fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") - mkdir -p "$build_dir" - if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then - swallow_output "${MAKE[@]}" defconfig - fi case "$test_mode" in 'system') test_target=run @@ -173,6 +180,13 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" + if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then + echo "Unsupported configuration" + return + fi + + mkdir -p "$build_dir" + swallow_output "${MAKE[@]}" defconfig swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index bf92481f925c..0406a065deb4 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -8,3 +8,5 @@ pidfd_getfd_test pidfd_setns_test pidfd_file_handle_test pidfd_bind_mount +pidfd_info_test +pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 301343a11b62..fcbefc0d77f6 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -3,7 +3,9 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ - pidfd_file_handle_test pidfd_bind_mount + pidfd_file_handle_test pidfd_bind_mount pidfd_info_test + +TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 0b96ac4b8ce5..cec22aa11cdf 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -12,6 +12,7 @@ #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/ioctl.h> #include <sys/types.h> #include <sys/wait.h> @@ -50,6 +51,107 @@ #define PIDFD_NONBLOCK O_NONBLOCK #endif +#ifndef PIDFD_SELF_THREAD +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#endif + +#ifndef PIDFD_SELF_THREAD_GROUP +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. 
*/ +#endif + +#ifndef PIDFD_SELF +#define PIDFD_SELF PIDFD_SELF_THREAD +#endif + +#ifndef PIDFD_SELF_PROCESS +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP +#endif + +#ifndef PIDFS_IOCTL_MAGIC +#define PIDFS_IOCTL_MAGIC 0xFF +#endif + +#ifndef PIDFD_GET_CGROUP_NAMESPACE +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#endif + +#ifndef PIDFD_GET_IPC_NAMESPACE +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#endif + +#ifndef PIDFD_GET_MNT_NAMESPACE +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#endif + +#ifndef PIDFD_GET_NET_NAMESPACE +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#endif + +#ifndef PIDFD_GET_PID_NAMESPACE +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#endif + +#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#endif + +#ifndef PIDFD_GET_TIME_NAMESPACE +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#endif + +#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#endif + +#ifndef PIDFD_GET_USER_NAMESPACE +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#endif + +#ifndef PIDFD_GET_UTS_NAMESPACE +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) +#endif + +#ifndef PIDFD_GET_INFO +#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info) +#endif + +#ifndef PIDFD_INFO_PID +#define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */ +#endif + +#ifndef PIDFD_INFO_CREDS +#define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */ +#endif + +#ifndef PIDFD_INFO_CGROUPID +#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ +#endif + +#ifndef PIDFD_INFO_EXIT +#define PIDFD_INFO_EXIT (1UL << 3) /* Always returned if available, even if not requested */ +#endif + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + __u64 mask; + __u64 cgroupid; + __u32 pid; + __u32 tgid; + __u32 ppid; + __u32 ruid; + __u32 rgid; + __u32 euid; + __u32 egid; + __u32 suid; + __u32 sgid; + __u32 fsuid; + __u32 fsgid; + __s32 exit_code; +}; + /* * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c * That means, when it wraps around any pid < 300 will be skipped. 
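The pidfd.h additions above carry fallback definitions for the pidfs ioctls, the PIDFD_SELF_* sentinels and struct pidfd_info so the tests still build against older UAPI headers. As a rough, self-contained sketch of how PIDFD_GET_INFO is meant to be used — a pidfd taken before the child is reaped keeps its exit status reachable afterwards — something like the following could work (the fallback definitions simply mirror the ones above; error handling is minimal and the ioctl will fail on kernels without pidfs info support):

#define _GNU_SOURCE
#include <linux/types.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PIDFS_IOCTL_MAGIC
#define PIDFS_IOCTL_MAGIC 0xFF
#endif
#ifndef PIDFD_GET_INFO
#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
#endif
#ifndef PIDFD_INFO_EXIT
#define PIDFD_INFO_EXIT (1UL << 3)
#endif

struct pidfd_info {
	__u64 mask;
	__u64 cgroupid;
	__u32 pid;
	__u32 tgid;
	__u32 ppid;
	__u32 ruid;
	__u32 rgid;
	__u32 euid;
	__u32 egid;
	__u32 suid;
	__u32 sgid;
	__u32 fsuid;
	__u32 fsgid;
	__s32 exit_code;
};

int main(void)
{
	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
	siginfo_t si;
	pid_t pid;
	int pidfd;

	pid = fork();
	if (pid < 0)
		return 1;
	if (pid == 0)
		_exit(42);

	/* Grab the pidfd before reaping so the exit status stays reachable. */
	pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0)
		return 1;

	/* Reap the child; the pidfd still pins its exit information. */
	if (waitid(P_PID, pid, &si, WEXITED))
		return 1;

	if (ioctl(pidfd, PIDFD_GET_INFO, &info))
		return 1;

	if ((info.mask & PIDFD_INFO_EXIT) && WIFEXITED(info.exit_code))
		printf("child exited with status %d\n", WEXITSTATUS(info.exit_code));

	close(pidfd);
	return 0;
}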
@@ -152,4 +254,11 @@ static inline ssize_t write_nointr(int fd, const void *buf, size_t count) return ret; } +static inline int sys_execveat(int dirfd, const char *pathname, + char *const argv[], char *const envp[], + int flags) +{ + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +} + #endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_exec_helper.c b/tools/testing/selftests/pidfd/pidfd_exec_helper.c new file mode 100644 index 000000000000..5516808c95f2 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_exec_helper.c @@ -0,0 +1,12 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +int main(int argc, char *argv[]) +{ + if (pause()) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index f062a986e382..f718aac75068 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -13,6 +13,7 @@ #include <syscall.h> #include <sys/wait.h> #include <sys/mman.h> +#include <sys/mount.h> #include "pidfd.h" #include "../kselftest.h" diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c new file mode 100644 index 000000000000..1758a1b0457b --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/types.h> +#include <poll.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> +#include <sys/socket.h> +#include <linux/kcmp.h> +#include <sys/stat.h> + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfd_info) +{ + pid_t child_pid1; + int child_pidfd1; + + pid_t child_pid2; + int child_pidfd2; + + pid_t child_pid3; + int child_pidfd3; + + pid_t child_pid4; + int child_pidfd4; +}; + +FIXTURE_SETUP(pidfd_info) +{ + int ret; + int ipc_sockets[2]; + char c; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid1 = create_child(&self->child_pidfd1, 0); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* SIGKILL but don't reap. */ + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid2 = create_child(&self->child_pidfd2, 0); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* SIGKILL and reap. 
*/ + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0); + EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0); + + self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid3, 0); + + if (self->child_pid3 == 0) + _exit(EXIT_SUCCESS); + + self->child_pid4 = create_child(&self->child_pidfd4, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid4, 0); + + if (self->child_pid4 == 0) + _exit(EXIT_SUCCESS); + + EXPECT_EQ(sys_waitid(P_PID, self->child_pid4, NULL, WEXITED), 0); +} + +FIXTURE_TEARDOWN(pidfd_info) +{ + sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0); + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + + sys_waitid(P_PID, self->child_pid1, NULL, WEXITED); + + sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + + sys_waitid(P_PID, self->child_pid2, NULL, WEXITED); + sys_waitid(P_PID, self->child_pid3, NULL, WEXITED); + sys_waitid(P_PID, self->child_pid4, NULL, WEXITED); +} + +TEST_F(pidfd_info, sigkill_exit) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has exited but not been reaped so this must work. */ + ASSERT_EQ(ioctl(self->child_pidfd1, PIDFD_GET_INFO, &info), 0); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd1, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); +} + +TEST_F(pidfd_info, sigkill_reaped) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has already been reaped and PIDFD_INFO_EXIT hasn't been set. */ + ASSERT_NE(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_EQ(errno, ESRCH); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); +} + +TEST_F(pidfd_info, success_exit) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has exited but not been reaped so this must work. */ + ASSERT_EQ(ioctl(self->child_pidfd3, PIDFD_GET_INFO, &info), 0); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd3, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); +} + +TEST_F(pidfd_info, success_reaped) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has already been reaped and PIDFD_INFO_EXIT hasn't been set. 
*/ + ASSERT_NE(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); + ASSERT_EQ(errno, ESRCH); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); +} + +TEST_F(pidfd_info, success_reaped_poll) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + struct pollfd fds = {}; + int nevents; + + fds.events = POLLIN; + fds.fd = self->child_pidfd2; + + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + ASSERT_EQ(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); +} + +static void *pidfd_info_pause_thread(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + /* Sleep untill we're killed. */ + pause(); + return NULL; +} + +TEST_F(pidfd_info, thread_group) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_thread, pidfd_leader_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }, info2; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_pause_thread, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* Make the thread-group leader exit prematurely. */ + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* + * Opening a PIDFD_THREAD aka thread-specific pidfd based on a + * thread-group leader must succeed. + */ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * We can't poll and wait for the old thread-group + * leader to exit using a thread-specific pidfd. The + * thread-group leader exited prematurely and + * notification is delayed until all subthreads have + * exited. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* Opening a thread as a thread-group leader must fail. */ + pidfd_thread = sys_pidfd_open(pid_thread, 0); + ASSERT_LT(pidfd_thread, 0); + + /* Opening a thread as a PIDFD_THREAD must succeed. 
*/ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* + * Note that pidfd_leader is a thread-group pidfd, so polling on it + * would only notify us once all thread in the thread-group have + * exited. So we can't poll before we have taken down the whole + * thread-group. + */ + + /* Get PIDFD_GET_INFO using the thread-group leader pidfd. */ + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* + * Now retrieve the same info using the thread specific pidfd + * for the thread-group leader. + */ + info2.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader_thread, PIDFD_GET_INFO, &info2), 0); + ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info2.pid, pid_leader); + + /* Now try the thread-specific pidfd. */ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* The thread hasn't exited, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_thread); + + /* + * Take down the whole thread-group. The thread-group leader + * exited successfully but the thread will now be SIGKILLed. + * This must be reflected in the recorded exit information. + */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + fds.events = POLLIN; + fds.fd = pidfd_leader; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + /* The thread-group leader has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* + * Retrieve exit information for the thread-group leader via the + * thread-group leader pidfd. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * Retrieve exit information for the thread-group leader via the + * thread-specific pidfd. + */ + info2.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader_thread, PIDFD_GET_INFO, &info2), 0); + ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_EXIT)); + + /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ + ASSERT_TRUE(WIFEXITED(info2.exit_code)); + ASSERT_EQ(WEXITSTATUS(info2.exit_code), 0); + + /* Retrieve exit information for the thread. */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + /* The thread got SIGKILLed. 
*/ + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +static void *pidfd_info_thread_exec(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + if (read_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + sys_execveat(AT_FDCWD, "pidfd_exec_helper", NULL, NULL, 0); + return NULL; +} + +TEST_F(pidfd_info, thread_group_exec) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_thread_exec, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* Make the thread-group leader exit prematurely. */ + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* Open a thread-specific pidfd for the thread-group leader. */ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * We can't poll and wait for the old thread-group + * leader to exit using a thread-specific pidfd. The + * thread-group leader exited prematurely and + * notification is delayed until all subthreads have + * exited. + * + * When the thread has execed it will taken over the old + * thread-group leaders struct pid. Calling poll after + * the thread execed will thus block again because a new + * thread-group has started. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + + /* Opening a thread as a PIDFD_THREAD must succeed. */ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + /* Now that we've opened a thread-specific pidfd the thread can exec. */ + ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* Wait until the kernel has SIGKILLed the thread. */ + fds.events = POLLHUP; + fds.fd = pidfd_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + /* The thread has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* Retrieve thread-specific exit info from pidfd. 
*/ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* + * While the kernel will have SIGKILLed the whole thread-group + * during exec it will cause the individual threads to exit + * cleanly. + */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * The thread-group leader is still alive, the thread has taken + * over its struct pid and thus its pid number. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* Take down the thread-group leader. */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + + /* + * Afte the exec we're dealing with an empty thread-group so now + * we must see an exit notification on the thread-specific pidfd + * for the thread-group leader as there's no subthread that can + * revive the struct pid. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_FALSE(!!(fds.revents & POLLHUP)); + + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + /* Retrieve exit information for the thread-group leader. */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +static void *pidfd_info_thread_exec_sane(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + if (read_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + sys_execveat(AT_FDCWD, "pidfd_exec_helper", NULL, NULL, 0); + return NULL; +} + +TEST_F(pidfd_info, thread_group_exec_thread) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_thread_exec_sane, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* + * Pause the thread-group leader. It will be killed once + * the subthread execs. + */ + pause(); + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + + /* Opening a thread as a PIDFD_THREAD must succeed. */ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + /* Open a thread-specific pidfd for the thread-group leader. 
*/ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * The subthread will now exec. The struct pid of the old + * thread-group leader will be assumed by the subthread which + * becomes the new thread-group leader. So no exit notification + * must be generated. Wait for 5 seconds and call it a success + * if no notification has been received. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Now that we've opened a thread-specific pidfd the thread can exec. */ + ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* Wait until the kernel has SIGKILLed the thread. */ + fds.events = POLLHUP; + fds.fd = pidfd_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + /* The thread has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* Retrieve thread-specific exit info from pidfd. */ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* + * While the kernel will have SIGKILLed the whole thread-group + * during exec it will cause the individual threads to exit + * cleanly. + */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * The thread-group leader is still alive, the thread has taken + * over its struct pid and thus its pid number. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* Take down the thread-group leader. */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + + /* + * Afte the exec we're dealing with an empty thread-group so now + * we must see an exit notification on the thread-specific pidfd + * for the thread-group leader as there's no subthread that can + * revive the struct pid. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_FALSE(!!(fds.revents & POLLHUP)); + + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + /* Retrieve exit information for the thread-group leader. 
*/ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index ce413a221bac..cd3de40e4977 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -22,32 +22,6 @@ #include "pidfd.h" #include "../kselftest.h" -#ifndef PIDFS_IOCTL_MAGIC -#define PIDFS_IOCTL_MAGIC 0xFF -#endif - -#ifndef PIDFD_GET_INFO -#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info) -#define PIDFD_INFO_CGROUPID (1UL << 0) - -struct pidfd_info { - __u64 request_mask; - __u64 cgroupid; - __u32 pid; - __u32 tgid; - __u32 ppid; - __u32 ruid; - __u32 rgid; - __u32 euid; - __u32 egid; - __u32 suid; - __u32 sgid; - __u32 fsuid; - __u32 fsgid; - __u32 spare0[1]; -}; -#endif - static int safe_int(const char *numstr, int *converted) { char *err = NULL; @@ -148,7 +122,7 @@ out: int main(int argc, char **argv) { struct pidfd_info info = { - .request_mask = PIDFD_INFO_CGROUPID, + .mask = PIDFD_INFO_CGROUPID, }; int pidfd = -1, ret = 1; pid_t pid; @@ -227,7 +201,7 @@ int main(int argc, char **argv) getegid(), info.sgid); goto on_error; } - if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { + if ((info.mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n"); goto on_error; } diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 222f8131283b..e6a079b3d5e2 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -16,55 +16,10 @@ #include <unistd.h> #include <sys/socket.h> #include <sys/stat.h> -#include <linux/ioctl.h> #include "pidfd.h" #include "../kselftest_harness.h" -#ifndef PIDFS_IOCTL_MAGIC -#define PIDFS_IOCTL_MAGIC 0xFF -#endif - -#ifndef PIDFD_GET_CGROUP_NAMESPACE -#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) -#endif - -#ifndef PIDFD_GET_IPC_NAMESPACE -#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) -#endif - -#ifndef PIDFD_GET_MNT_NAMESPACE -#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) -#endif - -#ifndef PIDFD_GET_NET_NAMESPACE -#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) -#endif - -#ifndef PIDFD_GET_PID_NAMESPACE -#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) -#endif - -#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE -#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) -#endif - -#ifndef PIDFD_GET_TIME_NAMESPACE -#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) -#endif - -#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE -#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) -#endif - -#ifndef PIDFD_GET_USER_NAMESPACE -#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) -#endif - -#ifndef PIDFD_GET_UTS_NAMESPACE -#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) -#endif - enum { PIDFD_NS_USER, PIDFD_NS_MNT, diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index e9728e86b4f2..fcd85cad9f18 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ 
-42,12 +42,41 @@ static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *)) #endif } -static int signal_received; +static pthread_t signal_received; static void set_signal_received_on_sigusr1(int sig) { if (sig == SIGUSR1) - signal_received = 1; + signal_received = pthread_self(); +} + +static int send_signal(int pidfd) +{ + int ret = 0; + + if (sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0) < 0) { + ret = -EINVAL; + goto exit; + } + + if (signal_received != pthread_self()) { + ret = -EINVAL; + goto exit; + } + +exit: + signal_received = 0; + return ret; +} + +static void *send_signal_worker(void *arg) +{ + int pidfd = (int)(intptr_t)arg; + int ret; + + /* We forward any errors for the caller to handle. */ + ret = send_signal(pidfd); + return (void *)(intptr_t)ret; } /* @@ -56,8 +85,11 @@ static void set_signal_received_on_sigusr1(int sig) */ static int test_pidfd_send_signal_simple_success(void) { - int pidfd, ret; + int pidfd; const char *test_name = "pidfd_send_signal send SIGUSR1"; + pthread_t thread; + void *thread_res; + int err; if (!have_pidfd_send_signal) { ksft_test_result_skip( @@ -66,25 +98,45 @@ static int test_pidfd_send_signal_simple_success(void) return 0; } + signal(SIGUSR1, set_signal_received_on_sigusr1); + + /* Try sending a signal to ourselves via /proc/self. */ pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); if (pidfd < 0) ksft_exit_fail_msg( "%s test: Failed to open process file descriptor\n", test_name); + err = send_signal(pidfd); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on sending pidfd signal\n", + test_name, err); + close(pidfd); - signal(SIGUSR1, set_signal_received_on_sigusr1); + /* Now try the same thing only using PIDFD_SELF_THREAD_GROUP. */ + err = send_signal(PIDFD_SELF_THREAD_GROUP); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD_GROUP signal\n", + test_name, err); - ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); - close(pidfd); - if (ret < 0) - ksft_exit_fail_msg("%s test: Failed to send signal\n", + /* + * Now try the same thing in a thread and assert thread ID is equal to + * worker thread ID. 
+ */ + if (pthread_create(&thread, NULL, send_signal_worker, + (void *)(intptr_t)PIDFD_SELF_THREAD)) + ksft_exit_fail_msg("%s test: Failed to create thread\n", test_name); - - if (signal_received != 1) - ksft_exit_fail_msg("%s test: Failed to receive signal\n", + if (pthread_join(thread, &thread_res)) + ksft_exit_fail_msg("%s test: Failed to join thread\n", test_name); + err = (int)(intptr_t)thread_res; + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD signal\n", + test_name, err); - signal_received = 0; ksft_test_result_pass("%s test: Sent signal\n", test_name); return 0; } diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index 3a0129467de6..d6deb6ffa1b9 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -24,6 +24,9 @@ #undef PKEY_DISABLE_EXECUTE #define PKEY_DISABLE_EXECUTE 0x4 +#undef PKEY_UNRESTRICTED +#define PKEY_UNRESTRICTED 0x0 + /* Older versions of libc do not define this */ #ifndef SEGV_PKUERR #define SEGV_PKUERR 4 @@ -93,7 +96,7 @@ int pkeys_unsupported(void) SKIP_IF(!hash_mmu); /* Check if the system call is supported */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); SKIP_IF(pkey < 0); sys_pkey_free(pkey); diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c index 0af4f02669a1..29b91b7456eb 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -72,7 +72,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) switch (fault_type) { case PKEY_DISABLE_ACCESS: - pkey_set_rights(fault_pkey, 0); + pkey_set_rights(fault_pkey, PKEY_UNRESTRICTED); break; case PKEY_DISABLE_EXECUTE: /* diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c index 2db76e56d4cb..e89a164c686b 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c +++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c @@ -83,7 +83,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) mprotect(pgstart, pgsize, PROT_EXEC)) _exit(1); else - pkey_set_rights(pkey, 0); + pkey_set_rights(pkey, PKEY_UNRESTRICTED); fault_count++; } diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index f061434af452..7ff53caeb4aa 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -95,16 +95,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. 
*/ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey1 < 0); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey2 < 0); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey3 < 0); info->amr |= 3ul << pkeyshift(pkey1) | 2ul << pkeyshift(pkey2); diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c index fc633014424f..10f63042cf91 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c @@ -57,16 +57,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. */ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey1 < 0, &info->child_sync); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey2 < 0, &info->child_sync); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey3 < 0, &info->child_sync); info->amr1 |= 3ul << pkeyshift(pkey1); diff --git a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh index 2e63ef009d59..2db12c5cad9c 100755 --- a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh +++ b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh @@ -49,7 +49,7 @@ do do err= val=$((d*1000+t*10+c)) - tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --bootargs "rcutorture.test_srcu_lockdep=$val" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --kconfig "CONFIG_FORCE_NEED_SRCU_NMI_SAFE=y" --bootargs "rcutorture.test_srcu_lockdep=$val rcutorture.reader_flavor=0x2" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 ret=$? 
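The powerpc pkey selftest changes above spell out the access-rights argument as PKEY_UNRESTRICTED instead of a bare 0. For reference, the same idiom with the generic glibc wrappers might look like the sketch below; it assumes glibc 2.27+ for pkey_alloc()/pkey_mprotect() plus kernel and hardware support for protection keys, and on unsupported systems pkey_alloc() simply fails:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef PKEY_UNRESTRICTED
#define PKEY_UNRESTRICTED 0x0
#endif

int main(void)
{
	size_t pgsz = sysconf(_SC_PAGESIZE);
	char *page;
	int pkey;

	/* Ask for a key with no access restrictions, spelled out explicitly. */
	pkey = pkey_alloc(0, PKEY_UNRESTRICTED);
	if (pkey < 0) {
		perror("pkey_alloc");	/* pkeys unsupported or exhausted */
		return 1;
	}

	page = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED) {
		pkey_free(pkey);
		return 1;
	}

	/* Tag the page with the key; access stays unrestricted for now. */
	if (pkey_mprotect(page, pgsz, PROT_READ | PROT_WRITE, pkey)) {
		perror("pkey_mprotect");
	} else {
		page[0] = 'x';
		printf("wrote through pkey %d\n", pkey);
	}

	munmap(page, pgsz);
	pkey_free(pkey);
	return 0;
}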
mv "$T/kvm.sh.out" "$RCUTORTURE/res/$ds/$val" if test "$d" -ne 0 && test "$ret" -eq 0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot index 2db39f298d18..fb61703690cb 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=srcud rcupdate.rcu_self_test=1 rcutorture.fwd_progress=3 srcutree.big_cpu_lim=5 +rcutorture.reader_flavor=0x8 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot index c419cac233ee..54f5c9053474 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot @@ -2,3 +2,9 @@ rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcupdate.rcu_self_test=1 + +# This part is for synchronize_rcu() testing +rcutorture.nfakewriters=-1 +rcutorture.gp_sync=1 +rcupdate.rcu_normal=1 +rcutree.rcu_normal_wake_from_gp=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index d30922d8c883..352393bc5c56 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -1,7 +1,8 @@ CONFIG_SMP=y CONFIG_NR_CPUS=16 -CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT_LAZY=y CONFIG_PREEMPT=n CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index 759ee51d3ddc..420632b030dc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -1,6 +1,7 @@ CONFIG_SMP=y CONFIG_NR_CPUS=74 -CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_LAZY=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n CONFIG_PREEMPT_DYNAMIC=n diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 16496de5f6ce..0fda241fa62b 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -9,3 +9,4 @@ param_test_compare_twice param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice +syscall_errors_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 5a3432fceb58..0d0a5fae5954 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -16,11 +16,12 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice + param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ + syscall_errors_test TEST_GEN_PROGS_EXTENDED = librseq.so -TEST_PROGS = run_param_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh TEST_FILES := settings @@ -54,3 +55,7 @@ $(OUTPUT)/param_test_mm_cid_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ + rseq.h 
rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index f6156790c3b4..663a9cef1952 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -71,9 +71,20 @@ static int rseq_ownership; /* Original struct rseq allocation size is 32 bytes. */ #define ORIG_RSEQ_ALLOC_SIZE 32 +/* + * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an + * rseq registration that is larger than the current rseq ABI. + */ +union rseq_tls { + struct rseq_abi abi; + char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE]; +}; + static -__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = { - .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED, +__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = { + .abi = { + .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED, + }, }; static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len, @@ -87,7 +98,7 @@ static int sys_getcpu(unsigned *cpu, unsigned *node) return syscall(__NR_getcpu, cpu, node, NULL); } -int rseq_available(void) +bool rseq_available(void) { int rc; @@ -96,9 +107,9 @@ int rseq_available(void) abort(); switch (errno) { case ENOSYS: - return 0; + return false; case EINVAL: - return 1; + return true; default: abort(); } @@ -149,7 +160,7 @@ int rseq_register_current_thread(void) /* Treat libc's ownership as a successful registration. */ return 0; } - rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); if (rc) { /* * After at least one thread has registered successfully @@ -183,7 +194,7 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; @@ -249,7 +260,7 @@ void rseq_init(void) rseq_ownership = 1; /* Calculate the offset of the rseq area from the thread pointer. */ - rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer(); + rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer(); /* rseq flags are deprecated, always set to 0. */ rseq_flags = 0; diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index ba424ce80a71..f51a5fdb0444 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -160,6 +160,11 @@ int32_t rseq_fallback_current_cpu(void); int32_t rseq_fallback_current_node(void); /* + * Returns true if rseq is supported. + */ +bool rseq_available(void); + +/* * Values returned can be either the current CPU number, -1 (rseq is * uninitialized), or -2 (rseq initialization has failed). 
*/ diff --git a/tools/testing/selftests/rseq/run_syscall_errors_test.sh b/tools/testing/selftests/rseq/run_syscall_errors_test.sh new file mode 100755 index 000000000000..9272246b39f2 --- /dev/null +++ b/tools/testing/selftests/rseq/run_syscall_errors_test.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com> + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./syscall_errors_test diff --git a/tools/testing/selftests/rseq/syscall_errors_test.c b/tools/testing/selftests/rseq/syscall_errors_test.c new file mode 100644 index 000000000000..a5d9e1f8a2dc --- /dev/null +++ b/tools/testing/selftests/rseq/syscall_errors_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com> + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <assert.h> +#include <stdint.h> +#include <syscall.h> +#include <string.h> +#include <unistd.h> + +#include "rseq.h" + +static int sys_rseq(void *rseq_abi, uint32_t rseq_len, + int flags, uint32_t sig) +{ + return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); +} + +/* + * Check the value of errno on some expected failures of the rseq syscall. + */ + +int main(void) +{ + struct rseq_abi *global_rseq = rseq_get_abi(); + int ret; + int errno_copy; + + if (!rseq_available()) { + fprintf(stderr, "rseq syscall unavailable"); + goto error; + } + + /* The current thread is NOT registered. */ + + /* EINVAL */ + errno = 0; + ret = sys_rseq(global_rseq, 32, -1, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid flag fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq((char *) global_rseq + 1, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with unaligned rseq_abi fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 31, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid size fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + +#if defined(__LP64__) && (!defined(__s390__) && !defined(__s390x__)) + /* + * We haven't found a reliable way to find an invalid address when + * running a 32bit userspace on a 64bit kernel, so only run this test + * on 64bit builds for the moment. + * + * Also exclude architectures that select + * CONFIG_ALTERNATE_USER_ADDRESS_SPACE where the kernel and userspace + * have their own address space and this failure can't happen. + */ + + /* EFAULT */ + errno = 0; + ret = sys_rseq((void *) -4096UL, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid address fails with errno set to EFAULT (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EFAULT) + goto error; +#endif + + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0 && errno != 0) + goto error; + + /* The current thread is registered. 
*/ + + /* EBUSY */ + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double registration fails with errno set to EBUSY (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EBUSY) + goto error; + + /* EPERM */ + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG + 1); + errno_copy = errno; + fprintf(stderr, "Unregistration with wrong RSEQ_SIG fails with errno to EPERM (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EPERM) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Unregistration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double unregistration fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + return 0; +error: + return -1; +} diff --git a/tools/testing/selftests/sched/config b/tools/testing/selftests/sched/config index e8b09aa7c0c4..1bb8bf6d7fd4 100644 --- a/tools/testing/selftests/sched/config +++ b/tools/testing/selftests/sched/config @@ -1 +1 @@ -CONFIG_SCHED_DEBUG=y +# empty diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 011762224600..f4531327b8e7 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -172,6 +172,7 @@ auto-test-targets := \ maximal \ maybe_null \ minimal \ + numa \ prog_run \ reload_loop \ select_cpu_dfl \ diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config index 0de9b4ee249d..aa901b05c8ad 100644 --- a/tools/testing/selftests/sched_ext/config +++ b/tools/testing/selftests/sched_ext/config @@ -1,4 +1,3 @@ -CONFIG_SCHED_DEBUG=y CONFIG_SCHED_CLASS_EXT=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c new file mode 100644 index 000000000000..a79d86ed54a1 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.bpf.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates the behavior of the NUMA-aware + * functionalities. + * + * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks + * are exclusively processed by CPUs within their respective nodes. Idle + * CPUs are selected only within the same node, so task migration can only + * occurs between CPUs belonging to the same node. 
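Stepping back to the syscall_errors_test.c and run_syscall_errors_test.sh additions above: the test only exercises the raw syscall because GLIBC_TUNABLES=glibc.pthread.rseq=0 keeps glibc from registering the rseq area first, and it copies errno into errno_copy before printing because library calls may clobber errno. A minimal round trip in the same spirit is sketched below. It is an editorial illustration, not code from the patch: MY_RSEQ_SIG is a placeholder (the x86 selftests use this value as RSEQ_SIG, and any fixed signature works as long as register and unregister agree), RSEQ_FLAG_UNREGISTER is the UAPI spelling of the selftests' RSEQ_ABI_FLAG_UNREGISTER, and 32 bytes is the original ABI allocation size mentioned in rseq.c. Run it with glibc's registration disabled, as the wrapper script does.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/rseq.h>		/* struct rseq, RSEQ_FLAG_UNREGISTER */

#define MY_RSEQ_SIG 0x53053053	/* placeholder signature */

/* The UAPI struct is already declared 32-byte aligned. */
static struct rseq rs;

int main(void)
{
	int ret, saved;

	ret = syscall(__NR_rseq, &rs, 32, 0, MY_RSEQ_SIG);
	saved = errno;		/* copy errno before any library call can clobber it */
	printf("register:   ret=%d (%s)\n", ret, ret ? strerror(saved) : "ok");

	ret = syscall(__NR_rseq, &rs, 32, RSEQ_FLAG_UNREGISTER, MY_RSEQ_SIG);
	saved = errno;
	printf("unregister: ret=%d (%s)\n", ret, ret ? strerror(saved) : "ok");
	return 0;
}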
+ * + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE; + +static bool is_cpu_idle(s32 cpu, int node) +{ + const struct cpumask *idle_cpumask; + bool idle; + + idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node); + idle = bpf_cpumask_test_cpu(cpu, idle_cpumask); + scx_bpf_put_cpumask(idle_cpumask); + + return idle; +} + +s32 BPF_STRUCT_OPS(numa_select_cpu, + struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + s32 cpu; + + /* + * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here, + * since it already tries to pick an idle CPU within the node + * first, but let's use both functions for better testing coverage. + */ + cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + if (cpu < 0) + cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + + if (is_cpu_idle(cpu, node)) + scx_bpf_error("CPU %d should be marked as busy", cpu); + + if (__COMPAT_scx_bpf_cpu_node(cpu) != node) + scx_bpf_error("CPU %d should be in node %d", cpu, node); + + return cpu; +} + +void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + + scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev) +{ + int node = __COMPAT_scx_bpf_cpu_node(cpu); + + scx_bpf_dsq_move_to_local(node); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init) +{ + int node, err; + + bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) { + err = scx_bpf_create_dsq(node, node); + if (err) + return err; + } + + return 0; +} + +void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops numa_ops = { + .select_cpu = (void *)numa_select_cpu, + .enqueue = (void *)numa_enqueue, + .dispatch = (void *)numa_dispatch, + .init = (void *)numa_init, + .exit = (void *)numa_exit, + .name = "numa", +}; diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c new file mode 100644 index 000000000000..b060c3b65c82 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "numa.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct numa *skel; + + skel = numa__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE; + skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE; + SCX_FAIL_IF(numa__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct numa *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.numa_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + /* Just sleeping is fine, plenty of scheduling events happening */ + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct numa *skel = 
ctx; + + numa__destroy(skel); +} + +struct scx_test numa = { + .name = "numa", + .description = "Verify NUMA-aware functionalities", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&numa) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 14ba51b52095..b2f76a52215a 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -155,6 +155,12 @@ struct seccomp_data { # endif #endif +#ifndef __NR_uretprobe +# if defined(__x86_64__) +# define __NR_uretprobe 335 +# endif +#endif + #ifndef SECCOMP_SET_MODE_STRICT #define SECCOMP_SET_MODE_STRICT 0 #endif diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 9814b3a1c77d..f0eceb0faf34 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -7,6 +7,7 @@ * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com> */ #define _GNU_SOURCE +#include <sys/prctl.h> #include <sys/time.h> #include <sys/types.h> #include <stdio.h> @@ -599,14 +600,84 @@ static void check_overrun(int which, const char *name) "check_overrun %s\n", name); } +#include <sys/syscall.h> + +static int do_timer_create(int *id) +{ + return syscall(__NR_timer_create, CLOCK_MONOTONIC, NULL, id); +} + +static int do_timer_delete(int id) +{ + return syscall(__NR_timer_delete, id); +} + +#ifndef PR_TIMER_CREATE_RESTORE_IDS +# define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + +static void check_timer_create_exact(void) +{ + int id; + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0)) { + switch (errno) { + case EINVAL: + ksft_test_result_skip("check timer create exact, not supported\n"); + return; + default: + ksft_test_result_skip("check timer create exact, errno = %d\n", errno); + return; + } + } + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 1) + fatal_error(NULL, "prctl(GET) failed\n"); + + id = 8; + if (do_timer_create(&id) < 0) + fatal_error(NULL, "timer_create()"); + + if (do_timer_delete(id)) + fatal_error(NULL, "timer_delete()"); + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0)) + fatal_error(NULL, "prctl(OFF)"); + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 0) + fatal_error(NULL, "prctl(GET) failed\n"); + + if (id != 8) { + ksft_test_result_fail("check timer create exact %d != 8\n", id); + return; + } + + /* Validate that it went back to normal mode and allocates ID 9 */ + if (do_timer_create(&id) < 0) + fatal_error(NULL, "timer_create()"); + + if (do_timer_delete(id)) + fatal_error(NULL, "timer_delete()"); + + if (id == 9) + ksft_test_result_pass("check timer create exact\n"); + else + ksft_test_result_fail("check timer create exact. Disabling failed.\n"); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(18); + ksft_set_plan(19); ksft_print_msg("Testing posix timers. 
False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); + check_timer_create_exact(); + check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); check_itimer(ITIMER_PROF, "ITIMER_PROF"); check_itimer(ITIMER_REAL, "ITIMER_REAL"); diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 83450145fe65..46c391d7f45d 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -47,7 +47,7 @@ int main(int argc, char **argv) pid = fork(); if (!pid) - return system("./inconsistency-check -c 1 -t 600"); + return system("./inconsistency-check -t 60"); ppm = 500; ret = 0; diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 1cf14a8da438..12a0614b9fd4 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -19,13 +19,20 @@ LDLIBS += -lgcc_s endif include ../lib.mk + +CFLAGS += $(TOOLS_INCLUDES) + +CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tables \ + -fno-stack-protector -include $(top_srcdir)/tools/include/nolibc/nolibc.h \ + -I$(top_srcdir)/tools/include/nolibc/ $(KHDR_INCLUDES) + $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c $(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c -$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c -$(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector +$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers +$(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index f89d052c730e..3ff00fb624a4 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -19,13 +19,14 @@ #include <stdint.h> #include <string.h> #include <limits.h> -#include <elf.h> +#include <linux/auxvec.h> +#include <linux/elf.h> #include "parse_vdso.h" /* And here's the code. 
*/ #ifndef ELF_BITS -# if ULONG_MAX > 0xffffffffUL +# if __SIZEOF_LONG__ >= 8 # define ELF_BITS 64 # else # define ELF_BITS 32 @@ -297,17 +298,3 @@ void *vdso_sym(const char *version, const char *name) return 0; } - -void vdso_init_from_auxv(void *auxv) -{ - ELF(auxv_t) *elf_auxv = auxv; - for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++) - { - if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) { - vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val); - return; - } - } - - vdso_info.valid = false; -} diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h index de0453067d7c..09d068ed11f9 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.h +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -26,6 +26,5 @@ */ void *vdso_sym(const char *version, const char *name); void vdso_init_from_sysinfo_ehdr(uintptr_t base); -void vdso_init_from_auxv(void *auxv); #endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 644915862af8..9ce795b806f0 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -1,142 +1,58 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vdso_test.c: Sample code to test parse_vdso.c on x86 - * Copyright (c) 2011-2014 Andy Lutomirski + * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and + * vDSO gettimeofday() + * Copyright (c) 2014 Andy Lutomirski * - * You can amuse yourself by compiling with: - * gcc -std=gnu99 -nostdlib - * -Os -fno-asynchronous-unwind-tables -flto -lgcc_s - * vdso_standalone_test_x86.c parse_vdso.c - * to generate a small binary. On x86_64, you can omit -lgcc_s - * if you want the binary to be completely standalone. + * Compile with: + * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c + * + * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ -#include <sys/syscall.h> +#include <stdio.h> +#ifndef NOLIBC +#include <sys/auxv.h> #include <sys/time.h> -#include <unistd.h> -#include <stdint.h> - -#include "parse_vdso.h" - -/* We need some libc functions... */ -int strcmp(const char *a, const char *b) -{ - /* This implementation is buggy: it never returns -1. */ - while (*a || *b) { - if (*a != *b) - return 1; - if (*a == 0 || *b == 0) - return 1; - a++; - b++; - } - - return 0; -} - -/* - * The clang build needs this, although gcc does not. - * Stolen from lib/string.c. - */ -void *memcpy(void *dest, const void *src, size_t count) -{ - char *tmp = dest; - const char *s = src; - - while (count--) - *tmp++ = *s++; - return dest; -} - -/* ...and two syscalls. This is x86-specific. 
*/ -static inline long x86_syscall3(long nr, long a0, long a1, long a2) -{ - long ret; -#ifdef __x86_64__ - asm volatile ("syscall" : "=a" (ret) : "a" (nr), - "D" (a0), "S" (a1), "d" (a2) : - "cc", "memory", "rcx", - "r8", "r9", "r10", "r11" ); -#else - asm volatile ("int $0x80" : "=a" (ret) : "a" (nr), - "b" (a0), "c" (a1), "d" (a2) : - "cc", "memory" ); #endif - return ret; -} -static inline long linux_write(int fd, const void *data, size_t len) -{ - return x86_syscall3(__NR_write, fd, (long)data, (long)len); -} +#include "../kselftest.h" +#include "parse_vdso.h" +#include "vdso_config.h" +#include "vdso_call.h" -static inline void linux_exit(int code) +int main(int argc, char **argv) { - x86_syscall3(__NR_exit, code, 0, 0); -} + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; -void to_base10(char *lastdig, time_t n) -{ - while (n) { - *lastdig = (n % 10) + '0'; - n /= 10; - lastdig--; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; } -} - -void c_main(void **stack) -{ - /* Parse the stack */ - long argc = (long)*stack; - stack += argc + 2; - - /* Now we're pointing at the environment. Skip it. */ - while(*stack) - stack++; - stack++; - /* Now we're pointing at auxv. Initialize the vDSO parser. */ - vdso_init_from_auxv((void *)stack); + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); - if (!gtod) - linux_exit(1); + if (!gtod) { + printf("Could not find %s\n", name[0]); + return KSFT_SKIP; + } struct timeval tv; - long ret = gtod(&tv, 0); + long ret = VDSO_CALL(gtod, 2, &tv, 0); if (ret == 0) { - char buf[] = "The time is .000000\n"; - to_base10(buf + 31, tv.tv_sec); - to_base10(buf + 38, tv.tv_usec); - linux_write(1, buf, sizeof(buf) - 1); + printf("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - linux_exit(ret); + printf("%s failed\n", name[0]); + return KSFT_FAIL; } - linux_exit(0); + return 0; } - -/* - * This is the real entry point. It passes the initial stack into - * the C entry point. - */ -asm ( - ".text\n" - ".global _start\n" - ".type _start,@function\n" - "_start:\n\t" -#ifdef __x86_64__ - "mov %rsp,%rdi\n\t" - "and $-16,%rsp\n\t" - "sub $8,%rsp\n\t" - "jmp c_main" -#else - "push %esp\n\t" - "call c_main\n\t" - "int $3" -#endif - ); diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e31b18ffae33..9ce795b806f0 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -10,11 +10,11 @@ * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. 
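Editor's note on the parse_vdso.c and vdso_standalone_test_x86.c changes above: the hand-rolled auxv walk (vdso_init_from_auxv) is removed and callers now locate the vDSO with getauxval(AT_SYSINFO_EHDR) before handing the address to vdso_init_from_sysinfo_ehdr(). A minimal, self-contained check of that lookup, independent of the selftest harness, might look like the sketch below; it is an illustration only, not code from the patch.

#include <elf.h>
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	unsigned long ehdr = getauxval(AT_SYSINFO_EHDR);

	if (!ehdr) {
		printf("AT_SYSINFO_EHDR is not present, no vDSO mapped\n");
		return 1;
	}
	/* This is the address vdso_init_from_sysinfo_ehdr() would be fed. */
	printf("vDSO ELF header mapped at %#lx\n", ehdr);
	return 0;
}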
*/ -#include <stdint.h> -#include <elf.h> #include <stdio.h> +#ifndef NOLIBC #include <sys/auxv.h> #include <sys/time.h> +#endif #include "../kselftest.h" #include "parse_vdso.h" diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 139fd9aa8b12..c305d2f613f0 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -27,7 +27,6 @@ CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_SHIRQ=y CONFIG_WQ_WATCHDOG=y -CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y CONFIG_SCHED_STACK_END_CHECK=y diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d51249f14e2f..28422c32cc8f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -19,7 +19,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx lam test_shadow_stack + corrupt_xstate_header amx lam test_shadow_stack avx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -132,3 +132,7 @@ $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack $(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack + +$(OUTPUT)/avx_64: CFLAGS += -mno-avx -mno-avx512f +$(OUTPUT)/amx_64: EXTRA_FILES += xstate.c +$(OUTPUT)/avx_64: EXTRA_FILES += xstate.c diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index 1fdf35a4d7f6..40769c16de1b 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -3,7 +3,6 @@ #define _GNU_SOURCE #include <err.h> #include <errno.h> -#include <pthread.h> #include <setjmp.h> #include <stdio.h> #include <string.h> @@ -14,169 +13,27 @@ #include <sys/auxv.h> #include <sys/mman.h> #include <sys/shm.h> -#include <sys/ptrace.h> #include <sys/syscall.h> #include <sys/wait.h> -#include <sys/uio.h> -#include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" +#include "xstate.h" #ifndef __x86_64__ # error This test is 64-bit only #endif -#define XSAVE_HDR_OFFSET 512 -#define XSAVE_HDR_SIZE 64 - -struct xsave_buffer { - union { - struct { - char legacy[XSAVE_HDR_OFFSET]; - char header[XSAVE_HDR_SIZE]; - char extended[0]; - }; - char bytes[0]; - }; -}; - -static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xsave (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) - : "memory"); -} - -static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xrstor (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); -} - /* err() exits and will not return */ #define fatal_error(msg, ...) 
err(1, "[FAIL]\t" msg, ##__VA_ARGS__) -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 #define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) #define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) #define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) -#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) - -static uint32_t xbuf_size; - -static struct { - uint32_t xbuf_offset; - uint32_t size; -} xtiledata; - -#define CPUID_LEAF_XSTATE 0xd -#define CPUID_SUBLEAF_XSTATE_USER 0x0 -#define TILE_CPUID 0x1d -#define TILE_PALETTE_ID 0x1 - -static void check_cpuid_xtiledata(void) -{ - uint32_t eax, ebx, ecx, edx; - - __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, - eax, ebx, ecx, edx); - - /* - * EBX enumerates the size (in bytes) required by the XSAVE - * instruction for an XSAVE area containing all the user state - * components corresponding to bits currently set in XCR0. - * - * Stash that off so it can be used to allocate buffers later. - */ - xbuf_size = ebx; - - __cpuid_count(CPUID_LEAF_XSTATE, XFEATURE_XTILEDATA, - eax, ebx, ecx, edx); - /* - * eax: XTILEDATA state component size - * ebx: XTILEDATA state component offset in user buffer - */ - if (!eax || !ebx) - fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", - eax, ebx); - - xtiledata.size = eax; - xtiledata.xbuf_offset = ebx; -} +struct xstate_info xtiledata; /* The helpers for managing XSAVE buffer and tile states: */ -struct xsave_buffer *alloc_xbuf(void) -{ - struct xsave_buffer *xbuf; - - /* XSAVE buffer should be 64B-aligned. */ - xbuf = aligned_alloc(64, xbuf_size); - if (!xbuf) - fatal_error("aligned_alloc()"); - return xbuf; -} - -static inline void clear_xstate_header(struct xsave_buffer *buffer) -{ - memset(&buffer->header, 0, sizeof(buffer->header)); -} - -static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv) -{ - /* XSTATE_BV is at the beginning of the header: */ - *(uint64_t *)(&buffer->header) = bv; -} - -static void set_rand_tiledata(struct xsave_buffer *xbuf) -{ - int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset]; - int data; - int i; - - /* - * Ensure that 'data' is never 0. This ensures that - * the registers are never in their initial configuration - * and thus never tracked as being in the init state. - */ - data = rand() | 1; - - for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++) - *ptr = data; -} - struct xsave_buffer *stashed_xsave; static void init_stashed_xsave(void) @@ -192,21 +49,6 @@ static void free_stashed_xsave(void) free(stashed_xsave); } -/* See 'struct _fpx_sw_bytes' at sigcontext.h */ -#define SW_BYTES_OFFSET 464 -/* N.B. The struct's field name varies so read from the offset. 
*/ -#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) - -static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer) -{ - return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET); -} - -static inline uint64_t get_fpx_sw_bytes_features(void *buffer) -{ - return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); -} - /* Work around printf() being unsafe in signals: */ #define SIGNAL_BUF_LEN 1000 char signal_message_buffer[SIGNAL_BUF_LEN]; @@ -304,17 +146,10 @@ static inline bool load_rand_tiledata(struct xsave_buffer *xbuf) { clear_xstate_header(xbuf); set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA); - set_rand_tiledata(xbuf); + set_rand_data(&xtiledata, xbuf); return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA); } -/* Return XTILEDATA to its initial configuration. */ -static inline void init_xtiledata(void) -{ - clear_xstate_header(stashed_xsave); - xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA); -} - enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; /* arch_prctl() and sigaltstack() test */ @@ -587,14 +422,6 @@ static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1) return true; } -static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf) -{ - int ret = __validate_tiledata_regs(xbuf); - - if (ret != 0) - fatal_error("TILEDATA registers changed"); -} - static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf) { int ret = __validate_tiledata_regs(xbuf); @@ -651,251 +478,6 @@ static void test_fork(void) _exit(0); } -/* Context switching test */ - -static struct _ctxtswtest_cfg { - unsigned int iterations; - unsigned int num_threads; -} ctxtswtest_config; - -struct futex_info { - pthread_t thread; - int nr; - pthread_mutex_t mutex; - struct futex_info *next; -}; - -static void *check_tiledata(void *info) -{ - struct futex_info *finfo = (struct futex_info *)info; - struct xsave_buffer *xbuf; - int i; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - /* - * Load random data into 'xbuf' and then restore - * it to the tile registers themselves. - */ - load_rand_tiledata(xbuf); - for (i = 0; i < ctxtswtest_config.iterations; i++) { - pthread_mutex_lock(&finfo->mutex); - - /* - * Ensure the register values have not - * diverged from those recorded in 'xbuf'. - */ - validate_tiledata_regs_same(xbuf); - - /* Load new, random values into xbuf and registers */ - load_rand_tiledata(xbuf); - - /* - * The last thread's last unlock will be for - * thread 0's mutex. However, thread 0 will - * have already exited the loop and the mutex - * will already be unlocked. - * - * Because this is not an ERRORCHECK mutex, - * that inconsistency will be silently ignored. - */ - pthread_mutex_unlock(&finfo->next->mutex); - } - - free(xbuf); - /* - * Return this thread's finfo, which is - * a unique value for this thread. - */ - return finfo; -} - -static int create_threads(int num, struct futex_info *finfo) -{ - int i; - - for (i = 0; i < num; i++) { - int next_nr; - - finfo[i].nr = i; - /* - * Thread 'i' will wait on this mutex to - * be unlocked. 
Lock it immediately after - * initialization: - */ - pthread_mutex_init(&finfo[i].mutex, NULL); - pthread_mutex_lock(&finfo[i].mutex); - - next_nr = (i + 1) % num; - finfo[i].next = &finfo[next_nr]; - - if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i])) - fatal_error("pthread_create()"); - } - return 0; -} - -static void affinitize_cpu0(void) -{ - cpu_set_t cpuset; - - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - - if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) - fatal_error("sched_setaffinity to CPU 0"); -} - -static void test_context_switch(void) -{ - struct futex_info *finfo; - int i; - - /* Affinitize to one CPU to force context switches */ - affinitize_cpu0(); - - req_xtiledata_perm(); - - printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n", - ctxtswtest_config.iterations, - ctxtswtest_config.num_threads); - - - finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads); - if (!finfo) - fatal_error("malloc()"); - - create_threads(ctxtswtest_config.num_threads, finfo); - - /* - * This thread wakes up thread 0 - * Thread 0 will wake up 1 - * Thread 1 will wake up 2 - * ... - * the last thread will wake up 0 - * - * ... this will repeat for the configured - * number of iterations. - */ - pthread_mutex_unlock(&finfo[0].mutex); - - /* Wait for all the threads to finish: */ - for (i = 0; i < ctxtswtest_config.num_threads; i++) { - void *thread_retval; - int rc; - - rc = pthread_join(finfo[i].thread, &thread_retval); - - if (rc) - fatal_error("pthread_join() failed for thread %d err: %d\n", - i, rc); - - if (thread_retval != &finfo[i]) - fatal_error("unexpected thread retval for thread %d: %p\n", - i, thread_retval); - - } - - printf("[OK]\tNo incorrect case was found.\n"); - - free(finfo); -} - -/* Ptrace test */ - -/* - * Make sure the ptracee has the expanded kernel buffer on the first - * use. Then, initialize the state before performing the state - * injection from the ptracer. - */ -static inline void ptracee_firstuse_tiledata(void) -{ - load_rand_tiledata(stashed_xsave); - init_xtiledata(); -} - -/* - * Ptracer injects the randomized tile data state. It also reads - * before and after that, which will execute the kernel's state copy - * functions. So, the tester is advised to double-check any emitted - * kernel messages. 
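The ptrace test being removed from amx.c here (and re-created generically in xstate.c later in this diff) drives PTRACE_GETREGSET/PTRACE_SETREGSET with the NT_X86_XSTATE regset through a struct iovec. For reference on the mechanism only, here is a hedged, self-contained sketch of the read side; the fixed 4096-byte buffer is a simplification (the real tests size the buffer from CPUID), error handling is minimal, and it is x86-specific.

#define _GNU_SOURCE
#include <elf.h>		/* NT_X86_XSTATE */
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	static char xbuf[4096] __attribute__((aligned(64)));
	struct iovec iov = { .iov_base = xbuf, .iov_len = sizeof(xbuf) };
	pid_t child;
	int status;

	child = fork();
	if (child < 0)
		return 1;
	if (!child) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);		/* stop so the parent can inspect us */
		_exit(0);
	}

	waitpid(child, &status, 0);	/* wait for the child to stop */

	if (ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov))
		perror("PTRACE_GETREGSET");
	else				/* the kernel updates iov_len to what it copied */
		printf("read %zu bytes of xstate from the child\n", iov.iov_len);

	ptrace(PTRACE_DETACH, child, NULL, NULL);
	waitpid(child, &status, 0);
	return 0;
}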
- */ -static void ptracer_inject_tiledata(pid_t target) -{ - struct xsave_buffer *xbuf; - struct iovec iov; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - printf("\tRead the init'ed tiledata via ptrace().\n"); - - iov.iov_base = xbuf; - iov.iov_len = xbuf_size; - - memset(stashed_xsave, 0, xbuf_size); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tThe init'ed tiledata was read from ptracee.\n"); - else - printf("[FAIL]\tThe init'ed tiledata was not read from ptracee.\n"); - - printf("\tInject tiledata via ptrace().\n"); - - load_rand_tiledata(xbuf); - - memcpy(&stashed_xsave->bytes[xtiledata.xbuf_offset], - &xbuf->bytes[xtiledata.xbuf_offset], - xtiledata.size); - - if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_SETREGSET"); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tTiledata was correctly written to ptracee.\n"); - else - printf("[FAIL]\tTiledata was not correctly written to ptracee.\n"); -} - -static void test_ptrace(void) -{ - pid_t child; - int status; - - child = fork(); - if (child < 0) { - err(1, "fork"); - } else if (!child) { - if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) - err(1, "PTRACE_TRACEME"); - - ptracee_firstuse_tiledata(); - - raise(SIGTRAP); - _exit(0); - } - - do { - wait(&status); - } while (WSTOPSIG(status) != SIGTRAP); - - ptracer_inject_tiledata(child); - - ptrace(PTRACE_DETACH, child, NULL, NULL); - wait(&status); - if (!WIFEXITED(status) || WEXITSTATUS(status)) - err(1, "ptrace test"); -} - int main(void) { unsigned long features; @@ -907,7 +489,11 @@ int main(void) return KSFT_SKIP; } - check_cpuid_xtiledata(); + xtiledata = get_xstate_info(XFEATURE_XTILEDATA); + if (!xtiledata.size || !xtiledata.xbuf_offset) { + fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", + xtiledata.size, xtiledata.xbuf_offset); + } init_stashed_xsave(); sethandler(SIGILL, handle_noperm, 0); @@ -919,11 +505,11 @@ int main(void) test_fork(); - ctxtswtest_config.iterations = 10; - ctxtswtest_config.num_threads = 5; - test_context_switch(); - - test_ptrace(); + /* + * Perform generic xstate tests for context switching, ptrace, + * and signal. 
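For context on the new get_xstate_info() call in main() below: the removed check_cpuid_xtiledata() shows the underlying mechanism, namely CPUID leaf 0xD with the XFEATURE number as the subleaf, which returns the component's size in EAX and its offset in the non-compacted XSAVE buffer in EBX. A minimal standalone query along those lines, assuming GCC/clang's <cpuid.h>, is sketched here; it is an illustration, not the helper's actual implementation.

#include <cpuid.h>
#include <stdio.h>

#define CPUID_LEAF_XSTATE	0xd
#define XFEATURE_XTILEDATA	18	/* any enabled xfeature number can be queried this way */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid_count(CPUID_LEAF_XSTATE, XFEATURE_XTILEDATA, eax, ebx, ecx, edx);
	if (!eax || !ebx) {
		printf("XTILEDATA is not enumerated on this CPU\n");
		return 1;
	}
	printf("XTILEDATA: %u bytes at offset %u in the XSAVE buffer\n", eax, ebx);
	return 0;
}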
+ */ + test_xstate(XFEATURE_XTILEDATA); clearhandler(SIGILL); free_stashed_xsave(); diff --git a/tools/testing/selftests/x86/avx.c b/tools/testing/selftests/x86/avx.c new file mode 100644 index 000000000000..11d5367c235f --- /dev/null +++ b/tools/testing/selftests/x86/avx.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE /* Required for inline xstate helpers */ +#include "xstate.h" + +int main(void) +{ + test_xstate(XFEATURE_YMM); + test_xstate(XFEATURE_OPMASK); + test_xstate(XFEATURE_ZMM_Hi256); + test_xstate(XFEATURE_Hi16_ZMM); +} diff --git a/tools/testing/selftests/x86/corrupt_xstate_header.c b/tools/testing/selftests/x86/corrupt_xstate_header.c index cf9ce8fbb656..93a89a5997ca 100644 --- a/tools/testing/selftests/x86/corrupt_xstate_header.c +++ b/tools/testing/selftests/x86/corrupt_xstate_header.c @@ -18,6 +18,7 @@ #include <sys/wait.h> #include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" static inline int xsave_enabled(void) { @@ -29,19 +30,6 @@ static inline int xsave_enabled(void) return ecx & (1U << 27); } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigusr1(int sig, siginfo_t *info, void *uc_void) { ucontext_t *uc = uc_void; diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index d1e919b0c1dc..5cb8393737d0 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -24,31 +24,11 @@ #include <errno.h> #include <sys/vm86.h> +#include "helpers.h" + static unsigned long load_addr = 0x10000; static int nerrs = 0; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sig_atomic_t got_signal; static void sighandler(int sig, siginfo_t *info, void *ctx_void) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 50cf32de6313..0a75252d31b6 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -28,6 +28,8 @@ #include <sys/wait.h> #include <setjmp.h> +#include "helpers.h" + #ifndef __x86_64__ # error This test is 64-bit only #endif @@ -39,28 +41,6 @@ static unsigned short *shared_scratch; static int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigsegv(int sig, siginfo_t *si, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; diff --git 
a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h index 4ef42c4559a9..6deaad035161 100644 --- a/tools/testing/selftests/x86/helpers.h +++ b/tools/testing/selftests/x86/helpers.h @@ -2,8 +2,13 @@ #ifndef __SELFTESTS_X86_HELPERS_H #define __SELFTESTS_X86_HELPERS_H +#include <signal.h> +#include <string.h> + #include <asm/processor-flags.h> +#include "../kselftest.h" + static inline unsigned long get_eflags(void) { #ifdef __x86_64__ @@ -22,4 +27,27 @@ static inline void set_eflags(unsigned long eflags) #endif } +static inline void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + +static inline void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + #endif /* __SELFTESTS_X86_HELPERS_H */ diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c index 57ec5e99edb9..69d5fb7050c2 100644 --- a/tools/testing/selftests/x86/ioperm.c +++ b/tools/testing/selftests/x86/ioperm.c @@ -20,30 +20,9 @@ #include <sched.h> #include <sys/io.h> -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/iopl.c b/tools/testing/selftests/x86/iopl.c index 7e3e09c1abac..457b6715542b 100644 --- a/tools/testing/selftests/x86/iopl.c +++ b/tools/testing/selftests/x86/iopl.c @@ -20,30 +20,9 @@ #include <sched.h> #include <sys/io.h> -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 4d4a76532dc9..18d736640ece 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <sys/syscall.h> +#include <sys/ioctl.h> #include <time.h> #include <signal.h> #include <setjmp.h> @@ -43,7 +44,15 @@ #define FUNC_INHERITE 0x20 #define FUNC_PASID 0x40 +/* get_user() pointer test cases */ +#define GET_USER_USER 0 +#define GET_USER_KERNEL_TOP 1 +#define GET_USER_KERNEL_BOT 2 +#define GET_USER_KERNEL 3 + #define TEST_MASK 0x7f +#define L5_SIGN_EXT_MASK (0xFFUL << 56) +#define L4_SIGN_EXT_MASK (0x1FFFFUL << 47) 
#define LOW_ADDR (0x1UL << 30) #define HIGH_ADDR (0x3UL << 48) @@ -115,23 +124,42 @@ static void segv_handler(int sig) siglongjmp(segv_env, 1); } -static inline int cpu_has_lam(void) +static inline int lam_is_available(void) { unsigned int cpuinfo[4]; + unsigned long bits = 0; + int ret; __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); - return (cpuinfo[0] & (1 << 26)); + /* Check if cpu supports LAM */ + if (!(cpuinfo[0] & (1 << 26))) { + ksft_print_msg("LAM is not supported!\n"); + return 0; + } + + /* Return 0 if CONFIG_ADDRESS_MASKING is not set */ + ret = syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits); + if (ret) { + ksft_print_msg("LAM is disabled in the kernel!\n"); + return 0; + } + + return 1; } -/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */ -static inline int cpu_has_la57(void) +static inline int la57_enabled(void) { - unsigned int cpuinfo[4]; + int ret; + void *p; + + p = mmap((void *)HIGH_ADDR, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); - __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + ret = p == MAP_FAILED ? 0 : 1; - return (cpuinfo[2] & (1 << 16)); + munmap(p, PAGE_SIZE); + return ret; } /* @@ -322,7 +350,7 @@ static int handle_mmap(struct testcases *test) flags, -1, 0); if (ptr == MAP_FAILED) { if (test->addr == HIGH_ADDR) - if (!cpu_has_la57()) + if (!la57_enabled()) return 3; /* unsupport LA57 */ return 1; } @@ -370,6 +398,78 @@ static int handle_syscall(struct testcases *test) return ret; } +static int get_user_syscall(struct testcases *test) +{ + uint64_t ptr_address, bitmask; + int fd, ret = 0; + void *ptr; + + if (la57_enabled()) { + bitmask = L5_SIGN_EXT_MASK; + ptr_address = HIGH_ADDR; + } else { + bitmask = L4_SIGN_EXT_MASK; + ptr_address = LOW_ADDR; + } + + ptr = mmap((void *)ptr_address, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + + if (ptr == MAP_FAILED) { + perror("failed to map byte to pass into get_user"); + return 1; + } + + if (set_lam(test->lam) != 0) { + ret = 2; + goto error; + } + + fd = memfd_create("lam_ioctl", 0); + if (fd == -1) { + munmap(ptr, PAGE_SIZE); + exit(EXIT_FAILURE); + } + + switch (test->later) { + case GET_USER_USER: + /* Control group - properly tagged user pointer */ + ptr = (void *)set_metadata((uint64_t)ptr, test->lam); + break; + case GET_USER_KERNEL_TOP: + /* Kernel address with top bit cleared */ + bitmask &= (bitmask >> 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL_BOT: + /* Kernel address with bottom sign-extension bit cleared */ + bitmask &= (bitmask << 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL: + /* Try to pass a kernel address */ + ptr = (void *)((uint64_t)ptr | bitmask); + break; + default: + printf("Invalid test case value passed!\n"); + break; + } + + /* + * Use FIOASYNC ioctl because it utilizes get_user() internally and is + * very non-invasive to the system. Pass differently tagged pointers to + * get_user() in order to verify that valid user pointers are going + * through and invalid kernel/non-canonical pointers are not. 
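A note on the new lam.c constants and the get_user_syscall() cases in this hunk: with 4-level paging, bits 63:48 of a canonical address replicate bit 47, so the whole 63:47 run (L4_SIGN_EXT_MASK) holds one value; with 5-level paging, bits 63:57 replicate bit 56, covering the 63:56 run (L5_SIGN_EXT_MASK). LAM_U57 additionally lets user space keep a tag in bits 62:57, which is what set_metadata() injects before the pointer is fed to get_user() via the FIOASYNC ioctl. The bit arithmetic in isolation looks like the sketch below; it is not part of the patch, the sample pointer value is arbitrary, and ULL suffixes are used here only so the sketch also compiles on 32-bit hosts.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define L5_SIGN_EXT_MASK (0xFFULL << 56)	/* bits 63:56 */
#define L4_SIGN_EXT_MASK (0x1FFFFULL << 47)	/* bits 63:47 */

/* Place a 6-bit tag in bits 62:57, the bits LAM_U57 tells the CPU to ignore. */
static uint64_t tag_u57(uint64_t ptr, uint64_t tag)
{
	return ptr | ((tag & 0x3f) << 57);
}

int main(void)
{
	uint64_t user_ptr = 0x7f0000001000ULL;	/* arbitrary user-looking address */

	printf("L4 sign-extension bits:  %#018" PRIx64 "\n", (uint64_t)L4_SIGN_EXT_MASK);
	printf("L5 sign-extension bits:  %#018" PRIx64 "\n", (uint64_t)L5_SIGN_EXT_MASK);
	printf("LAM_U57 tagged pointer:  %#018" PRIx64 "\n", tag_u57(user_ptr, 0x2a));
	printf("forged kernel-style ptr: %#018" PRIx64 "\n", user_ptr | L4_SIGN_EXT_MASK);
	return 0;
}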
+ */ + if (ioctl(fd, FIOASYNC, ptr) != 0) + ret = 1; + + close(fd); +error: + munmap(ptr, PAGE_SIZE); + return ret; +} + int sys_uring_setup(unsigned int entries, struct io_uring_params *p) { return (int)syscall(__NR_io_uring_setup, entries, p); @@ -596,8 +696,10 @@ int do_uring(unsigned long lam) fi->file_fd = file_fd; ring = malloc(sizeof(*ring)); - if (!ring) + if (!ring) { + free(fi); return 1; + } memset(ring, 0, sizeof(struct io_ring)); @@ -883,6 +985,33 @@ static struct testcases syscall_cases[] = { .test_func = handle_syscall, .msg = "SYSCALL:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", }, + { + .later = GET_USER_USER, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER: get_user() and pass a properly tagged user pointer.\n", + }, + { + .later = GET_USER_KERNEL_TOP, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the top bit cleared.\n", + }, + { + .later = GET_USER_KERNEL_BOT, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the bottom sign-extension bit cleared.\n", + }, + { + .later = GET_USER_KERNEL, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() and pass a kernel pointer.\n", + }, }; static struct testcases mmap_cases[] = { @@ -1181,10 +1310,8 @@ int main(int argc, char **argv) tests_cnt = 0; - if (!cpu_has_lam()) { - ksft_print_msg("Unsupported LAM feature!\n"); + if (!lam_is_available()) return KSFT_SKIP; - } while ((c = getopt(argc, argv, "ht:")) != -1) { switch (c) { diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 3a29346e1452..bb99a71380a5 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -26,6 +26,8 @@ #include <asm/prctl.h> #include <sys/prctl.h> +#include "helpers.h" + #define AR_ACCESSED (1<<8) #define AR_TYPE_RODATA (0 * (1<<9)) @@ -506,20 +508,6 @@ static void fix_sa_restorer(int sig) } #endif -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - - fix_sa_restorer(sig); -} - static jmp_buf jmpbuf; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) @@ -549,9 +537,11 @@ static void do_multicpu_tests(void) } sethandler(SIGSEGV, sigsegv, 0); + fix_sa_restorer(SIGSEGV); #ifdef __i386__ /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. 
*/ sethandler(SIGILL, sigsegv, 0); + fix_sa_restorer(SIGILL); #endif printf("[RUN]\tCross-CPU LDT invalidation\n"); diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c index cc3de6ff9fba..f22cb6b382f9 100644 --- a/tools/testing/selftests/x86/mov_ss_trap.c +++ b/tools/testing/selftests/x86/mov_ss_trap.c @@ -36,7 +36,7 @@ #include <setjmp.h> #include <sys/prctl.h> -#define X86_EFLAGS_RF (1UL << 16) +#include "helpers.h" #if __x86_64__ # define REG_IP REG_RIP @@ -94,18 +94,6 @@ static void enable_watchpoint(void) } } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static char const * const signames[] = { [SIGSEGV] = "SIGSEGV", [SIGBUS] = "SIBGUS", diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 12aaa063196e..360ec88d5432 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -15,6 +15,8 @@ #include <asm/ptrace-abi.h> #include <sys/auxv.h> +#include "helpers.h" + /* Bitness-agnostic defines for user_regs_struct fields. */ #ifdef __x86_64__ # define user_syscall_nr orig_rax @@ -93,18 +95,6 @@ static siginfo_t wait_trap(pid_t chld) return si; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void setsigign(int sig, int flags) { struct sigaction sa; @@ -116,16 +106,6 @@ static void setsigign(int sig, int flags) err(1, "sigaction"); } -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - #ifdef __x86_64__ # define REG_BP REG_RBP #else diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c index f689af75e979..0ae1b784498c 100644 --- a/tools/testing/selftests/x86/sigaltstack.c +++ b/tools/testing/selftests/x86/sigaltstack.c @@ -14,6 +14,8 @@ #include <sys/resource.h> #include <setjmp.h> +#include "helpers.h" + /* sigaltstack()-enforced minimum stack */ #define ENFORCED_MINSIGSTKSZ 2048 @@ -27,30 +29,6 @@ static bool sigalrm_expected; static unsigned long at_minstack_size; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static int setup_altstack(void *start, unsigned long size) { stack_t ss; diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 0b75b29f794b..26ef562f4232 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -46,6 +46,8 @@ #include <sys/ptrace.h> #include <sys/user.h> +#include "helpers.h" + /* 
Pull in AR_xyz defines. */ typedef unsigned int u32; typedef unsigned short u16; @@ -138,28 +140,6 @@ static unsigned short LDT3(int idx) return (idx << 3) | 7; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void add_ldt(const struct user_desc *desc, unsigned short *var, const char *name) { diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c index 9a30f443e928..280d7a22b9c9 100644 --- a/tools/testing/selftests/x86/single_step_syscall.c +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -33,28 +33,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t sig_traps, sig_eflags; sigjmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 48ab065a76f9..f67a2df335ba 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -17,18 +17,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index a108b80dd082..f9c9814160f0 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -18,18 +18,6 @@ static unsigned int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigtrap(int sig, siginfo_t *si, void *ctx_void) { } diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 991591718bb0..41c42b7b54a6 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -25,6 +25,7 @@ #include <sys/mman.h> #include <linux/ptrace.h> +#include "../kselftest.h" /* Common system call numbers */ #define SYS_READ 0 @@ -313,7 +314,7 @@ static void test_syscall_numbering(void) * The MSB is supposed to be ignored, so we loop over a few * to test that out. 
*/ - for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { + for (size_t i = 0; i < ARRAY_SIZE(msbs); i++) { int msb = msbs[i]; run("Checking system calls with msb = %d (0x%x)\n", msb, msb); diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index b30de9aaa6d4..5fb531e3ad7c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -22,6 +22,8 @@ #include <sys/mman.h> #include <assert.h> +#include "helpers.h" + /* * These items are in clang_helpers_64.S, in order to avoid clang inline asm * limitations: @@ -31,28 +33,6 @@ extern const char test_page[]; static void const *current_test_page_addr = test_page; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* State used by our signal handlers. */ static gregset_t initial_regs; diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 6de11b4df458..05e1e6774fba 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -310,19 +310,6 @@ static void test_getcpu(int cpu) static jmp_buf jmpbuf; static volatile unsigned long segv_err; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - ksft_exit_fail_msg("sigaction failed\n"); -} - static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 4c311e1af4c7..9cc17588d818 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -43,18 +43,6 @@ int main() #include <dlfcn.h> #include <unwind.h> -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t nerrs; static unsigned long sysinfo; static bool got_sysinfo = false; diff --git a/tools/testing/selftests/x86/xstate.c b/tools/testing/selftests/x86/xstate.c new file mode 100644 index 000000000000..23c1d6c964ea --- /dev/null +++ b/tools/testing/selftests/x86/xstate.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include <elf.h> +#include <pthread.h> +#include <stdbool.h> + +#include <asm/prctl.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/uio.h> +#include <sys/wait.h> + +#include "helpers.h" +#include "xstate.h" + +/* + * The userspace xstate test suite is designed to be generic and operates + * with randomized xstate data. 
However, some states require special handling: + * + * - PKRU and XTILECFG need specific adjustments, such as modifying + * randomization behavior or using fixed values. + * - But, PKRU already has a dedicated test suite in /tools/selftests/mm. + * - Legacy states (FP and SSE) are excluded, as they are not considered + * part of extended states (xstates) and their usage is already deeply + * integrated into user-space libraries. + */ +#define XFEATURE_MASK_TEST_SUPPORTED \ + ((1 << XFEATURE_YMM) | \ + (1 << XFEATURE_OPMASK) | \ + (1 << XFEATURE_ZMM_Hi256) | \ + (1 << XFEATURE_Hi16_ZMM) | \ + (1 << XFEATURE_XTILEDATA)) + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static inline uint64_t get_xstatebv(struct xsave_buffer *xbuf) +{ + return *(uint64_t *)(&xbuf->header); +} + +static struct xstate_info xstate; + +struct futex_info { + unsigned int iterations; + struct futex_info *next; + pthread_mutex_t mutex; + pthread_t thread; + bool valid; + int nr; +}; + +static inline void load_rand_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + set_xstatebv(xbuf, xstate->mask); + set_rand_data(xstate, xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void load_init_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void copy_xstate(struct xsave_buffer *xbuf_dst, struct xsave_buffer *xbuf_src) +{ + memcpy(&xbuf_dst->bytes[xstate.xbuf_offset], + &xbuf_src->bytes[xstate.xbuf_offset], + xstate.size); +} + +static inline bool validate_xstate_same(struct xsave_buffer *xbuf1, struct xsave_buffer *xbuf2) +{ + int ret; + + ret = memcmp(&xbuf1->bytes[xstate.xbuf_offset], + &xbuf2->bytes[xstate.xbuf_offset], + xstate.size); + return ret == 0; +} + +static inline bool validate_xregs_same(struct xsave_buffer *xbuf1) +{ + struct xsave_buffer *xbuf2; + bool ret; + + xbuf2 = alloc_xbuf(); + if (!xbuf2) + ksft_exit_fail_msg("failed to allocate XSAVE buffer\n"); + + xsave(xbuf2, xstate.mask); + ret = validate_xstate_same(xbuf1, xbuf2); + + free(xbuf2); + return ret; +} + +/* Context switching test */ + +static void *check_xstate(void *info) +{ + struct futex_info *finfo = (struct futex_info *)info; + struct xsave_buffer *xbuf; + int i; + + xbuf = alloc_xbuf(); + if (!xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + /* + * Load random data into 'xbuf' and then restore it to the xstate + * registers. + */ + load_rand_xstate(&xstate, xbuf); + finfo->valid = true; + + for (i = 0; i < finfo->iterations; i++) { + pthread_mutex_lock(&finfo->mutex); + + /* + * Ensure the register values have not diverged from the + * record. Then reload a new random value. If it failed + * ever before, skip it. + */ + if (finfo->valid) { + finfo->valid = validate_xregs_same(xbuf); + load_rand_xstate(&xstate, xbuf); + } + + /* + * The last thread's last unlock will be for thread 0's + * mutex. However, thread 0 will have already exited the + * loop and the mutex will already be unlocked. + * + * Because this is not an ERRORCHECK mutex, that + * inconsistency will be silently ignored. 
+ */ + pthread_mutex_unlock(&finfo->next->mutex); + } + + free(xbuf); + return finfo; +} + +static void create_threads(uint32_t num_threads, uint32_t iterations, struct futex_info *finfo) +{ + int i; + + for (i = 0; i < num_threads; i++) { + int next_nr; + + finfo[i].nr = i; + finfo[i].iterations = iterations; + + /* + * Thread 'i' will wait on this mutex to be unlocked. + * Lock it immediately after initialization: + */ + pthread_mutex_init(&finfo[i].mutex, NULL); + pthread_mutex_lock(&finfo[i].mutex); + + next_nr = (i + 1) % num_threads; + finfo[i].next = &finfo[next_nr]; + + if (pthread_create(&finfo[i].thread, NULL, check_xstate, &finfo[i])) + ksft_exit_fail_msg("pthread_create() failed\n"); + } +} + +static bool checkout_threads(uint32_t num_threads, struct futex_info *finfo) +{ + void *thread_retval; + bool valid = true; + int err, i; + + for (i = 0; i < num_threads; i++) { + err = pthread_join(finfo[i].thread, &thread_retval); + if (err) + ksft_exit_fail_msg("pthread_join() failed for thread %d err: %d\n", i, err); + + if (thread_retval != &finfo[i]) { + ksft_exit_fail_msg("unexpected thread retval for thread %d: %p\n", + i, thread_retval); + } + + valid &= finfo[i].valid; + } + + return valid; +} + +static void affinitize_cpu0(void) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + ksft_exit_fail_msg("sched_setaffinity to CPU 0 failed\n"); +} + +static void test_context_switch(uint32_t num_threads, uint32_t iterations) +{ + struct futex_info *finfo; + + /* Affinitize to one CPU to force context switches */ + affinitize_cpu0(); + + printf("[RUN]\t%s: check context switches, %d iterations, %d threads.\n", + xstate.name, iterations, num_threads); + + finfo = malloc(sizeof(*finfo) * num_threads); + if (!finfo) + ksft_exit_fail_msg("unable to allocate memory\n"); + + create_threads(num_threads, iterations, finfo); + + /* + * This thread wakes up thread 0 + * Thread 0 will wake up 1 + * Thread 1 will wake up 2 + * ... + * The last thread will wake up 0 + * + * This will repeat for the configured + * number of iterations. + */ + pthread_mutex_unlock(&finfo[0].mutex); + + /* Wait for all the threads to finish: */ + if (checkout_threads(num_threads, finfo)) + printf("[OK]\tNo incorrect case was found.\n"); + else + printf("[FAIL]\tFailed with context switching test.\n"); + + free(finfo); +} + +/* + * Ptrace test for the ABI format as described in arch/x86/include/asm/user.h + */ + +/* + * Make sure the ptracee has the expanded kernel buffer on the first use. + * Then, initialize the state before performing the state injection from + * the ptracer. For non-dynamic states, this is benign. + */ +static inline void ptracee_touch_xstate(void) +{ + struct xsave_buffer *xbuf; + + xbuf = alloc_xbuf(); + + load_rand_xstate(&xstate, xbuf); + load_init_xstate(&xstate, xbuf); + + free(xbuf); +} + +/* + * Ptracer injects the randomized xstate data. It also reads before and + * after that, which will execute the kernel's state copy functions.
+ */ +static void ptracer_inject_xstate(pid_t target) +{ + uint32_t xbuf_size = get_xbuf_size(); + struct xsave_buffer *xbuf1, *xbuf2; + struct iovec iov; + + /* + * Allocate buffers to keep data while ptracer can write the + * other buffer + */ + xbuf1 = alloc_xbuf(); + xbuf2 = alloc_xbuf(); + if (!xbuf1 || !xbuf2) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + iov.iov_base = xbuf1; + iov.iov_len = xbuf_size; + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + printf("[RUN]\t%s: inject xstate via ptrace().\n", xstate.name); + + load_rand_xstate(&xstate, xbuf1); + copy_xstate(xbuf2, xbuf1); + + if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_SETREGSET failed\n"); + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + if (*(uint64_t *)get_fpx_sw_bytes(xbuf1) == xgetbv(0)) + printf("[OK]\t'xfeatures' in SW reserved area was correctly written\n"); + else + printf("[FAIL]\t'xfeatures' in SW reserved area was not correctly written\n"); + + if (validate_xstate_same(xbuf2, xbuf1)) + printf("[OK]\txstate was correctly updated.\n"); + else + printf("[FAIL]\txstate was not correctly updated.\n"); + + free(xbuf1); + free(xbuf2); +} + +static void test_ptrace(void) +{ + pid_t child; + int status; + + child = fork(); + if (child < 0) { + ksft_exit_fail_msg("fork() failed\n"); + } else if (!child) { + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME failed\n"); + + ptracee_touch_xstate(); + + raise(SIGTRAP); + _exit(0); + } + + do { + wait(&status); + } while (WSTOPSIG(status) != SIGTRAP); + + ptracer_inject_xstate(child); + + ptrace(PTRACE_DETACH, child, NULL, NULL); + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + ksft_exit_fail_msg("ptracee exit error\n"); +} + +/* + * Test signal delivery for the ABI compatibility. + * See the ABI format: arch/x86/include/uapi/asm/sigcontext.h + */ + +/* + * Avoid using printf() in signal handlers as it is not + * async-signal-safe. 
+ */ +#define SIGNAL_BUF_LEN 1000 +static char signal_message_buffer[SIGNAL_BUF_LEN]; +static void sig_print(char *msg) +{ + int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1; + + strncat(signal_message_buffer, msg, left); +} + +static struct xsave_buffer *stashed_xbuf; + +static void validate_sigfpstate(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + void *xbuf = ctx->uc_mcontext.fpregs; + struct _fpx_sw_bytes *sw_bytes; + uint32_t magic2; + + /* Reset the signal message buffer: */ + signal_message_buffer[0] = '\0'; + + sw_bytes = get_fpx_sw_bytes(xbuf); + if (sw_bytes->magic1 == FP_XSTATE_MAGIC1) + sig_print("[OK]\t'magic1' is valid\n"); + else + sig_print("[FAIL]\t'magic1' is not valid\n"); + + if (get_fpx_sw_bytes_features(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in SW reserved area is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in SW reserved area is not valid\n"); + + if (get_xstatebv(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in XSAVE header is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in XSAVE header is not valid\n"); + + if (validate_xstate_same(stashed_xbuf, xbuf)) + sig_print("[OK]\txstate delivery was successful\n"); + else + sig_print("[FAIL]\txstate delivery was not successful\n"); + + magic2 = *(uint32_t *)(xbuf + sw_bytes->xstate_size); + if (magic2 == FP_XSTATE_MAGIC2) + sig_print("[OK]\t'magic2' is valid\n"); + else + sig_print("[FAIL]\t'magic2' is not valid\n"); + + set_rand_data(&xstate, xbuf); + copy_xstate(stashed_xbuf, xbuf); +} + +static void test_signal(void) +{ + bool valid_xstate; + + /* + * The signal handler will access this to verify xstate context + * preservation. + */ + stashed_xbuf = alloc_xbuf(); + if (!stashed_xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + printf("[RUN]\t%s: load xstate and raise SIGUSR1\n", xstate.name); + + sethandler(SIGUSR1, validate_sigfpstate, 0); + + load_rand_xstate(&xstate, stashed_xbuf); + + raise(SIGUSR1); + + /* + * Immediately record the test result, deferring printf() to + * prevent unintended state contamination by that. 
+ */ + valid_xstate = validate_xregs_same(stashed_xbuf); + printf("%s", signal_message_buffer); + + printf("[RUN]\t%s: load new xstate from sighandler and check it after sigreturn\n", + xstate.name); + + if (valid_xstate) + printf("[OK]\txstate was restored correctly\n"); + else + printf("[FAIL]\txstate restoration failed\n"); + + clearhandler(SIGUSR1); + free(stashed_xbuf); +} + +void test_xstate(uint32_t feature_num) +{ + const unsigned int ctxtsw_num_threads = 5, ctxtsw_iterations = 10; + unsigned long features; + long rc; + + if (!(XFEATURE_MASK_TEST_SUPPORTED & (1 << feature_num))) { + ksft_print_msg("The xstate test does not fully support the component %u, yet.\n", + feature_num); + return; + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features); + if (rc || !(features & (1 << feature_num))) { + ksft_print_msg("The kernel does not support feature number: %u\n", feature_num); + return; + } + + xstate = get_xstate_info(feature_num); + if (!xstate.size || !xstate.xbuf_offset) { + ksft_exit_fail_msg("invalid state size/offset (%d/%d)\n", + xstate.size, xstate.xbuf_offset); + } + + test_context_switch(ctxtsw_num_threads, ctxtsw_iterations); + test_ptrace(); + test_signal(); +} diff --git a/tools/testing/selftests/x86/xstate.h b/tools/testing/selftests/x86/xstate.h new file mode 100644 index 000000000000..42af36ec852f --- /dev/null +++ b/tools/testing/selftests/x86/xstate.h @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __SELFTESTS_X86_XSTATE_H +#define __SELFTESTS_X86_XSTATE_H + +#include <stdint.h> + +#include "../kselftest.h" + +#define XSAVE_HDR_OFFSET 512 +#define XSAVE_HDR_SIZE 64 + +/* + * List of XSAVE features Linux knows about. Copied from + * arch/x86/include/asm/fpu/types.h + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + XFEATURE_PT_UNIMPLEMENTED_SO_FAR, + XFEATURE_PKRU, + XFEATURE_PASID, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILECFG, + XFEATURE_XTILEDATA, + + XFEATURE_MAX, +}; + +/* Copied from arch/x86/kernel/fpu/xstate.c */ +static const char *xfeature_names[] = +{ + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", + "Protection Keys User registers", + "PASID state", + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", +}; + +struct xsave_buffer { + union { + struct { + char legacy[XSAVE_HDR_OFFSET]; + char header[XSAVE_HDR_SIZE]; + char extended[0]; + }; + char bytes[0]; + }; +}; + +static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xsave (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xrstor (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); +} + +#define CPUID_LEAF_XSTATE 0xd +#define CPUID_SUBLEAF_XSTATE_USER 0x0 + +static inline uint32_t get_xbuf_size(void) +{ + uint32_t 
eax, ebx, ecx, edx; + + __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, + eax, ebx, ecx, edx); + + /* + * EBX enumerates the size (in bytes) required by the XSAVE + * instruction for an XSAVE area containing all the user state + * components corresponding to bits currently set in XCR0. + */ + return ebx; +} + +struct xstate_info { + const char *name; + uint32_t num; + uint32_t mask; + uint32_t xbuf_offset; + uint32_t size; +}; + +static inline struct xstate_info get_xstate_info(uint32_t xfeature_num) +{ + struct xstate_info xstate = { }; + uint32_t eax, ebx, ecx, edx; + + if (xfeature_num >= XFEATURE_MAX) { + ksft_print_msg("unknown state\n"); + return xstate; + } + + xstate.name = xfeature_names[xfeature_num]; + xstate.num = xfeature_num; + xstate.mask = 1 << xfeature_num; + + __cpuid_count(CPUID_LEAF_XSTATE, xfeature_num, + eax, ebx, ecx, edx); + xstate.size = eax; + xstate.xbuf_offset = ebx; + return xstate; +} + +static inline struct xsave_buffer *alloc_xbuf(void) +{ + uint32_t xbuf_size = get_xbuf_size(); + + /* XSAVE buffer should be 64B-aligned. */ + return aligned_alloc(64, xbuf_size); +} + +static inline void clear_xstate_header(struct xsave_buffer *xbuf) +{ + memset(&xbuf->header, 0, sizeof(xbuf->header)); +} + +static inline void set_xstatebv(struct xsave_buffer *xbuf, uint64_t bv) +{ + /* XSTATE_BV is at the beginning of the header: */ + *(uint64_t *)(&xbuf->header) = bv; +} + +/* See 'struct _fpx_sw_bytes' at sigcontext.h */ +#define SW_BYTES_OFFSET 464 +/* N.B. The struct's field name varies so read from the offset. */ +#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) + +static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *xbuf) +{ + return xbuf + SW_BYTES_OFFSET; +} + +static inline uint64_t get_fpx_sw_bytes_features(void *buffer) +{ + return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); +} + +static inline void set_rand_data(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + int *ptr = (int *)&xbuf->bytes[xstate->xbuf_offset]; + int data, i; + + /* + * Ensure that 'data' is never 0. This ensures that + * the registers are never in their initial configuration + * and thus never tracked as being in the init state. + */ + data = rand() | 1; + + for (i = 0; i < xstate->size / sizeof(int); i++, ptr++) + *ptr = data; +} + +/* Testing kernel's context switching and ABI support for the xstate. */ +void test_xstate(uint32_t feature_num); + +#endif /* __SELFTESTS_X86_XSTATE_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 55153494ac70..e85b33a92624 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4240,15 +4240,14 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) if (fd < 0) return fd; - file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); + file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, + O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; @@ -5036,16 +5035,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) if (fd < 0) return fd; - file = anon_inode_getfile("kvm-vm-stats", - &kvm_vm_stats_fops, kvm, O_RDONLY); + file = anon_inode_getfile_fmode("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; |