summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/PCI/pciebus-howto.rst14
-rw-r--r--MAINTAINERS9
-rw-r--r--drivers/pci/hotplug/pciehp_ctrl.c5
-rw-r--r--drivers/pci/hotplug/pciehp_hpc.c2
-rw-r--r--drivers/pci/pci.c60
-rw-r--r--drivers/pci/pci.h41
-rw-r--r--drivers/pci/pcie/Makefile2
-rw-r--r--drivers/pci/pcie/bwctrl.c366
-rw-r--r--drivers/pci/pcie/portdrv.c9
-rw-r--r--drivers/pci/pcie/portdrv.h6
-rw-r--r--drivers/pci/probe.c15
-rw-r--r--drivers/pci/quirks.c32
-rw-r--r--drivers/thermal/Kconfig9
-rw-r--r--drivers/thermal/Makefile2
-rw-r--r--drivers/thermal/pcie_cooling.c80
-rw-r--r--include/linux/pci-bwctrl.h28
-rw-r--r--include/linux/pci.h23
-rw-r--r--include/uapi/linux/pci_regs.h1
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/pcie_bwctrl/Makefile2
-rwxr-xr-xtools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh122
-rwxr-xr-xtools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh67
22 files changed, 842 insertions, 54 deletions
diff --git a/Documentation/PCI/pciebus-howto.rst b/Documentation/PCI/pciebus-howto.rst
index f344452651e1..375d9ce171f6 100644
--- a/Documentation/PCI/pciebus-howto.rst
+++ b/Documentation/PCI/pciebus-howto.rst
@@ -217,8 +217,12 @@ capability structure except the PCI Express capability structure,
that is shared between many drivers including the service drivers.
RMW Capability accessors (pcie_capability_clear_and_set_word(),
pcie_capability_set_word(), and pcie_capability_clear_word()) protect
-a selected set of PCI Express Capability Registers (Link Control
-Register and Root Control Register). Any change to those registers
-should be performed using RMW accessors to avoid problems due to
-concurrent updates. For the up-to-date list of protected registers,
-see pcie_capability_clear_and_set_word().
+a selected set of PCI Express Capability Registers:
+
+* Link Control Register
+* Root Control Register
+* Link Control 2 Register
+
+Any change to those registers should be performed using RMW accessors to
+avoid problems due to concurrent updates. For the up-to-date list of
+protected registers, see pcie_capability_clear_and_set_word().
diff --git a/MAINTAINERS b/MAINTAINERS
index c27f3190737f..d7ffef4382df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17933,6 +17933,15 @@ F: include/linux/of_pci.h
F: include/linux/pci*
F: include/uapi/linux/pci*
+PCIE BANDWIDTH CONTROLLER
+M: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
+L: linux-pci@vger.kernel.org
+S: Supported
+F: drivers/pci/pcie/bwctrl.c
+F: drivers/thermal/pcie_cooling.c
+F: include/linux/pci-bwctrl.h
+F: tools/testing/selftests/pcie_bwctrl/
+
PCIE DRIVER FOR AMAZON ANNAPURNA LABS
M: Jonathan Chocron <jonnyc@amazon.com>
L: linux-pci@vger.kernel.org
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index dcdbfcf404dd..d603a7aa7483 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -19,6 +19,8 @@
#include <linux/types.h>
#include <linux/pm_runtime.h>
#include <linux/pci.h>
+
+#include "../pci.h"
#include "pciehp.h"
/* The following routines constitute the bulk of the
@@ -127,6 +129,9 @@ static void remove_board(struct controller *ctrl, bool safe_removal)
pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
INDICATOR_NOOP);
+
+ /* Don't carry LBMS indications across */
+ pcie_reset_lbms_count(ctrl->pcie->port);
}
static int pciehp_enable_slot(struct controller *ctrl);
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 736ad8baa2a5..bb5a8d9f03ad 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -319,7 +319,7 @@ int pciehp_check_link_status(struct controller *ctrl)
return -1;
}
- pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
+ __pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
if (!found) {
ctrl_info(ctrl, "Slot(%s): No device found\n",
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..f85f380cdb9b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4740,7 +4740,7 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt)
* to track link speed or width changes made by hardware itself
* in attempt to correct unreliable link operation.
*/
- pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
+ pcie_reset_lbms_count(pdev);
return rc;
}
@@ -6189,38 +6189,64 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
EXPORT_SYMBOL(pcie_bandwidth_available);
/**
- * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * pcie_get_supported_speeds - query Supported Link Speed Vector
* @dev: PCI device to query
*
- * Query the PCI device speed capability. Return the maximum link speed
- * supported by the device.
+ * Query @dev supported link speeds.
+ *
+ * Implementation Note in PCIe r6.0 sec 7.5.3.18 recommends determining
+ * supported link speeds using the Supported Link Speeds Vector in the Link
+ * Capabilities 2 Register (when available).
+ *
+ * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.
+ *
+ * Without Link Capabilities 2, i.e., prior to PCIe r3.0, Supported Link
+ * Speeds field in Link Capabilities is used and only 2.5 GT/s and 5.0 GT/s
+ * speeds were defined.
+ *
+ * For @dev without Supported Link Speed Vector, the field is synthesized
+ * from the Max Link Speed field in the Link Capabilities Register.
+ *
+ * Return: Supported Link Speeds Vector (+ reserved 0 at LSB).
*/
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+u8 pcie_get_supported_speeds(struct pci_dev *dev)
{
u32 lnkcap2, lnkcap;
+ u8 speeds;
/*
- * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18. The
- * implementation note there recommends using the Supported Link
- * Speeds Vector in Link Capabilities 2 when supported.
- *
- * Without Link Capabilities 2, i.e., prior to PCIe r3.0, software
- * should use the Supported Link Speeds field in Link Capabilities,
- * where only 2.5 GT/s and 5.0 GT/s speeds were defined.
+ * Speeds retain the reserved 0 at LSB before PCIe Supported Link
+ * Speeds Vector to allow using SLS Vector bit defines directly.
*/
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
+ speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS;
/* PCIe r3.0-compliant */
- if (lnkcap2)
- return PCIE_LNKCAP2_SLS2SPEED(lnkcap2);
+ if (speeds)
+ return speeds;
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+
+ /* Synthesize from the Max Link Speed field */
if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB)
- return PCIE_SPEED_5_0GT;
+ speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB;
else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB)
- return PCIE_SPEED_2_5GT;
+ speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;
- return PCI_SPEED_UNKNOWN;
+ return speeds;
+}
+
+/**
+ * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device speed capability.
+ *
+ * Return: the maximum link speed supported by the device.
+ */
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+{
+ return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds);
}
EXPORT_SYMBOL(pcie_get_speed_cap);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..1d5c519e19b1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -331,6 +331,17 @@ void pci_disable_bridge_window(struct pci_dev *dev);
struct pci_bus *pci_bus_get(struct pci_bus *bus);
void pci_bus_put(struct pci_bus *bus);
+#define PCIE_LNKCAP_SLS2SPEED(lnkcap) \
+({ \
+ ((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
+ PCI_SPEED_UNKNOWN); \
+})
+
/* PCIe link information from Link Capabilities 2 */
#define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
@@ -341,6 +352,15 @@ void pci_bus_put(struct pci_bus *bus);
(lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
PCI_SPEED_UNKNOWN)
+#define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \
+ ((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \
+ PCI_SPEED_UNKNOWN)
+
/* PCIe speed to Mb/s reduced by encoding overhead */
#define PCIE_SPEED2MBS_ENC(speed) \
((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \
@@ -373,12 +393,16 @@ static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed)
return -EINVAL;
}
+u8 pcie_get_supported_speeds(struct pci_dev *dev);
const char *pci_speed_string(enum pci_bus_speed speed);
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
-enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
void pcie_report_downtraining(struct pci_dev *dev);
-void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
+
+static inline void __pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
+{
+ bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
+}
+void pcie_update_link_speed(struct pci_bus *bus);
/* Single Root I/O Virtualization */
struct pci_sriov {
@@ -692,6 +716,17 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
static inline void pcie_ecrc_get_policy(char *str) { }
#endif
+#ifdef CONFIG_PCIEPORTBUS
+void pcie_reset_lbms_count(struct pci_dev *port);
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
+#else
+static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
+static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
struct pci_dev_reset_methods {
u16 vendor;
u16 device;
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 6461aa93fe76..53ccab62314d 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -4,7 +4,7 @@
pcieportdrv-y := portdrv.o rcec.o
-obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o
+obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o
obj-y += aspm.o
obj-$(CONFIG_PCIEAER) += aer.o err.o
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
new file mode 100644
index 000000000000..b59cacc740fa
--- /dev/null
+++ b/drivers/pci/pcie/bwctrl.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe bandwidth controller
+ *
+ * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
+ *
+ * Copyright (C) 2019 Dell Inc
+ * Copyright (C) 2023-2024 Intel Corporation
+ *
+ * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
+ * and notify the operating system when the Link Width or Speed changes. The
+ * notification capability is required for all Root Ports and Downstream
+ * Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
+ *
+ * This service port driver hooks into the Bandwidth Notification interrupt
+ * watching for changes or links becoming degraded in operation. It updates
+ * the cached Current Link Speed that is exposed to user space through sysfs.
+ */
+
+#define dev_fmt(fmt) "bwctrl: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "../pci.h"
+#include "portdrv.h"
+
+/**
+ * struct pcie_bwctrl_data - PCIe bandwidth controller
+ * @set_speed_mutex: Serializes link speed changes
+ * @lbms_count: Count for LBMS (since last reset)
+ * @cdev: Thermal cooling device associated with the port
+ */
+struct pcie_bwctrl_data {
+ struct mutex set_speed_mutex;
+ atomic_t lbms_count;
+ struct thermal_cooling_device *cdev;
+};
+
+/*
+ * Prevent port removal during LBMS count accessors and Link Speed changes.
+ *
+ * These have to be differentiated because pcie_bwctrl_change_speed() calls
+ * pcie_retrain_link() which uses LBMS count reset accessor on success
+ * (using just one rwsem triggers "possible recursive locking detected"
+ * warning).
+ */
+static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
+static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
+
+static bool pcie_valid_speed(enum pci_bus_speed speed)
+{
+ return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT);
+}
+
+static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
+{
+ static const u8 speed_conv[] = {
+ [PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT,
+ [PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT,
+ [PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT,
+ [PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT,
+ [PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT,
+ [PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT,
+ };
+
+ if (WARN_ON_ONCE(!pcie_valid_speed(speed)))
+ return 0;
+
+ return speed_conv[speed];
+}
+
+static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
+{
+ return __fls(supported_speeds);
+}
+
+/**
+ * pcie_bwctrl_select_speed - Select Target Link Speed
+ * @port: PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ *
+ * Select Target Link Speed by take into account Supported Link Speeds of
+ * both the Root Port and the Endpoint.
+ *
+ * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
+ */
+static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
+{
+ struct pci_bus *bus = port->subordinate;
+ u8 desired_speeds, supported_speeds;
+ struct pci_dev *dev;
+
+ desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
+ __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));
+
+ supported_speeds = port->supported_speeds;
+ if (bus) {
+ down_read(&pci_bus_sem);
+ dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
+ if (dev)
+ supported_speeds &= dev->supported_speeds;
+ up_read(&pci_bus_sem);
+ }
+ if (!supported_speeds)
+ return PCI_EXP_LNKCAP2_SLS_2_5GB;
+
+ return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds);
+}
+
+static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
+{
+ int ret;
+
+ ret = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
+ PCI_EXP_LNKCTL2_TLS, target_speed);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return pcibios_err_to_errno(ret);
+
+ ret = pcie_retrain_link(port, use_lt);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Ensure link speed updates also with platforms that have problems
+ * with notifications.
+ */
+ if (port->subordinate)
+ pcie_update_link_speed(port->subordinate);
+
+ return 0;
+}
+
+/**
+ * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
+ * @port: PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ * @use_lt: Wait for the LT or DLLLA bit to detect the end of link training
+ *
+ * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
+ * adjusted downwards to the best speed supported by both the Port and PCIe
+ * Device underneath it.
+ *
+ * Return:
+ * * 0 - on success
+ * * -EINVAL - @speed_req is not a PCIe Link Speed
+ * * -ENODEV - @port is not controllable
+ * * -ETIMEDOUT - changing Link Speed took too long
+ * * -EAGAIN - Link Speed was changed but @speed_req was not achieved
+ */
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+ bool use_lt)
+{
+ struct pci_bus *bus = port->subordinate;
+ u16 target_speed;
+ int ret;
+
+ if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
+ return -EINVAL;
+
+ if (bus && bus->cur_bus_speed == speed_req)
+ return 0;
+
+ target_speed = pcie_bwctrl_select_speed(port, speed_req);
+
+ scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
+ struct pcie_bwctrl_data *data = port->link_bwctrl;
+
+ /*
+ * port->link_bwctrl is NULL during initial scan when called
+ * e.g. from the Target Speed quirk.
+ */
+ if (data)
+ mutex_lock(&data->set_speed_mutex);
+
+ ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
+
+ if (data)
+ mutex_unlock(&data->set_speed_mutex);
+ }
+
+ /*
+ * Despite setting higher speed into the Target Link Speed, empty
+ * bus won't train to 5GT+ speeds.
+ */
+ if (!ret && bus && bus->cur_bus_speed != speed_req &&
+ !list_empty(&bus->devices))
+ ret = -EAGAIN;
+
+ return ret;
+}
+
+static void pcie_bwnotif_enable(struct pcie_device *srv)
+{
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+ struct pci_dev *port = srv->port;
+ u16 link_status;
+ int ret;
+
+ /* Count LBMS seen so far as one */
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS)
+ atomic_inc(&data->lbms_count);
+
+ pcie_capability_set_word(port, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+ PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+
+ /*
+ * Update after enabling notifications & clearing status bits ensures
+ * link speed is up to date.
+ */
+ pcie_update_link_speed(port->subordinate);
+}
+
+static void pcie_bwnotif_disable(struct pci_dev *port)
+{
+ pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+}
+
+static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
+{
+ struct pcie_device *srv = context;
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+ struct pci_dev *port = srv->port;
+ u16 link_status, events;
+ int ret;
+
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return IRQ_NONE;
+
+ events = link_status & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+ if (!events)
+ return IRQ_NONE;
+
+ if (events & PCI_EXP_LNKSTA_LBMS)
+ atomic_inc(&data->lbms_count);
+
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+
+ /*
+ * Interrupts will not be triggered from any further Link Speed
+ * change until LBMS is cleared by the write. Therefore, re-read the
+ * speed (inside pcie_update_link_speed()) after LBMS has been
+ * cleared to avoid missing link speed changes.
+ */
+ pcie_update_link_speed(port->subordinate);
+
+ return IRQ_HANDLED;
+}
+
+void pcie_reset_lbms_count(struct pci_dev *port)
+{
+ struct pcie_bwctrl_data *data;
+
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+ data = port->link_bwctrl;
+ if (data)
+ atomic_set(&data->lbms_count, 0);
+ else
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+ PCI_EXP_LNKSTA_LBMS);
+}
+
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+ struct pcie_bwctrl_data *data;
+
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+ data = port->link_bwctrl;
+ if (!data)
+ return -ENOTTY;
+
+ *val = atomic_read(&data->lbms_count);
+
+ return 0;
+}
+
+static int pcie_bwnotif_probe(struct pcie_device *srv)
+{
+ struct pci_dev *port = srv->port;
+ int ret;
+
+ struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device,
+ sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
+ if (ret)
+ return ret;
+
+ ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
+ IRQF_SHARED, "PCIe bwctrl", srv);
+ if (ret)
+ return ret;
+
+ scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
+ scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
+ port->link_bwctrl = no_free_ptr(data);
+ pcie_bwnotif_enable(srv);
+ }
+ }
+
+ pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
+
+ /* Don't fail on errors. Don't leave IS_ERR() "pointer" into ->cdev */
+ port->link_bwctrl->cdev = pcie_cooling_device_register(port);
+ if (IS_ERR(port->link_bwctrl->cdev))
+ port->link_bwctrl->cdev = NULL;
+
+ return 0;
+}
+
+static void pcie_bwnotif_remove(struct pcie_device *srv)
+{
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+
+ pcie_cooling_device_unregister(data->cdev);
+
+ pcie_bwnotif_disable(srv->port);
+
+ scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
+ scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
+ srv->port->link_bwctrl = NULL;
+}
+
+static int pcie_bwnotif_suspend(struct pcie_device *srv)
+{
+ pcie_bwnotif_disable(srv->port);
+ return 0;
+}
+
+static int pcie_bwnotif_resume(struct pcie_device *srv)
+{
+ pcie_bwnotif_enable(srv);
+ return 0;
+}
+
+static struct pcie_port_service_driver pcie_bwctrl_driver = {
+ .name = "pcie_bwctrl",
+ .port_type = PCIE_ANY_PORT,
+ .service = PCIE_PORT_SERVICE_BWCTRL,
+ .probe = pcie_bwnotif_probe,
+ .suspend = pcie_bwnotif_suspend,
+ .resume = pcie_bwnotif_resume,
+ .remove = pcie_bwnotif_remove,
+};
+
+int __init pcie_bwctrl_init(void)
+{
+ return pcie_port_service_register(&pcie_bwctrl_driver);
+}
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index 6af5e0425872..5e10306b6308 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -68,7 +68,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
*/
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_capability_read_word(dev, PCI_EXP_FLAGS, &reg16);
*pme = FIELD_GET(PCI_EXP_FLAGS_IRQ, reg16);
nvec = *pme + 1;
@@ -150,11 +150,11 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_irq = pci_irq_vector(dev, pme);
irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
- irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
+ irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq;
}
if (mask & PCIE_PORT_SERVICE_AER)
@@ -271,7 +271,7 @@ static int get_port_device_capability(struct pci_dev *dev)
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap);
if (linkcap & PCI_EXP_LNKCAP_LBNC)
- services |= PCIE_PORT_SERVICE_BWNOTIF;
+ services |= PCIE_PORT_SERVICE_BWCTRL;
}
return services;
@@ -828,6 +828,7 @@ static void __init pcie_init_services(void)
pcie_aer_init();
pcie_pme_init();
pcie_dpc_init();
+ pcie_bwctrl_init();
pcie_hp_init();
}
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 12c89ea0313b..bd29d1cc7b8b 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,8 @@
#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
#define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
-#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
-#define PCIE_PORT_SERVICE_BWNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
+#define PCIE_PORT_SERVICE_BWCTRL_SHIFT 4 /* Bandwidth Controller (notifications) */
+#define PCIE_PORT_SERVICE_BWCTRL (1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT)
#define PCIE_PORT_DEVICE_MAXSERVICES 5
@@ -51,6 +51,8 @@ int pcie_dpc_init(void);
static inline int pcie_dpc_init(void) { return 0; }
#endif
+int pcie_bwctrl_init(void);
+
/* Port Type */
#define PCIE_ANY_PORT (~0)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..c138daf78961 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -742,9 +742,13 @@ const char *pci_speed_string(enum pci_bus_speed speed)
}
EXPORT_SYMBOL_GPL(pci_speed_string);
-void pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
+void pcie_update_link_speed(struct pci_bus *bus)
{
- bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
+ struct pci_dev *bridge = bus->self;
+ u16 linksta;
+
+ pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
+ __pcie_update_link_speed(bus, linksta);
}
EXPORT_SYMBOL_GPL(pcie_update_link_speed);
@@ -827,13 +831,11 @@ static void pci_set_bus_speed(struct pci_bus *bus)
if (pci_is_pcie(bridge)) {
u32 linkcap;
- u16 linksta;
pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &linkcap);
bus->max_bus_speed = pcie_link_speed[linkcap & PCI_EXP_LNKCAP_SLS];
- pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
- pcie_update_link_speed(bus, linksta);
+ pcie_update_link_speed(bus);
}
}
@@ -1947,6 +1949,9 @@ int pci_setup_device(struct pci_dev *dev)
set_pcie_untrusted(dev);
+ if (pci_is_pcie(dev))
+ dev->supported_speeds = pcie_get_supported_speeds(dev);
+
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index dccb60c1d9cc..dcf1c86a5488 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -33,6 +33,18 @@
#include <linux/switchtec.h>
#include "pci.h"
+static bool pcie_lbms_seen(struct pci_dev *dev, u16 lnksta)
+{
+ unsigned long count;
+ int ret;
+
+ ret = pcie_lbms_count(dev, &count);
+ if (ret < 0)
+ return lnksta & PCI_EXP_LNKSTA_LBMS;
+
+ return count > 0;
+}
+
/*
* Retrain the link of a downstream PCIe port by hand if necessary.
*
@@ -96,22 +108,16 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &lnkctl2);
pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
- if ((lnksta & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_DLLLA)) ==
- PCI_EXP_LNKSTA_LBMS) {
+ if (!(lnksta & PCI_EXP_LNKSTA_DLLLA) && pcie_lbms_seen(dev, lnksta)) {
u16 oldlnkctl2 = lnkctl2;
pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n");
- lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
- lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
- ret = pcie_retrain_link(dev, false);
+ ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false);
if (ret) {
pci_info(dev, "retraining failed\n");
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2,
- oldlnkctl2);
- pcie_retrain_link(dev, true);
+ pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2),
+ true);
return ret;
}
@@ -125,11 +131,7 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n");
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
- lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
- lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
- ret = pcie_retrain_link(dev, false);
+ ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false);
if (ret) {
pci_info(dev, "retraining failed\n");
return ret;
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 61e7ae524b1f..d3f9686e26e7 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -220,6 +220,15 @@ config DEVFREQ_THERMAL
If you want this support, you should say Y here.
+config PCIE_THERMAL
+ bool "PCIe cooling support"
+ depends on PCIEPORTBUS
+ help
+ This implements PCIe cooling mechanism through bandwidth reduction
+ for PCIe devices.
+
+ If you want this support, you should say Y here.
+
config THERMAL_EMULATION
bool "Thermal emulation mode support"
help
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56beb40..210c16c91461 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -31,6 +31,8 @@ thermal_sys-$(CONFIG_CPU_IDLE_THERMAL) += cpuidle_cooling.o
# devfreq cooling
thermal_sys-$(CONFIG_DEVFREQ_THERMAL) += devfreq_cooling.o
+thermal_sys-$(CONFIG_PCIE_THERMAL) += pcie_cooling.o
+
obj-$(CONFIG_K3_THERMAL) += k3_bandgap.o k3_j72xx_bandgap.o
# platform thermal drivers
obj-y += broadcom/
diff --git a/drivers/thermal/pcie_cooling.c b/drivers/thermal/pcie_cooling.c
new file mode 100644
index 000000000000..a876d64f1582
--- /dev/null
+++ b/drivers/thermal/pcie_cooling.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PCIe cooling device
+ *
+ * Copyright (C) 2023-2024 Intel Corporation
+ */
+
+#include <linux/build_bug.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
+#include <linux/slab.h>
+#include <linux/sprintf.h>
+#include <linux/thermal.h>
+
+#define COOLING_DEV_TYPE_PREFIX "PCIe_Port_Link_Speed_"
+
+static int pcie_cooling_get_max_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+ struct pci_dev *port = cdev->devdata;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ *state = port->subordinate->max_bus_speed - PCIE_SPEED_2_5GT;
+
+ return 0;
+}
+
+static int pcie_cooling_get_cur_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+ struct pci_dev *port = cdev->devdata;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ *state = cdev->max_state - (port->subordinate->cur_bus_speed - PCIE_SPEED_2_5GT);
+
+ return 0;
+}
+
+static int pcie_cooling_set_cur_level(struct thermal_cooling_device *cdev, unsigned long state)
+{
+ struct pci_dev *port = cdev->devdata;
+ enum pci_bus_speed speed;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ speed = (cdev->max_state - state) + PCIE_SPEED_2_5GT;
+
+ return pcie_set_target_speed(port, speed, true);
+}
+
+static struct thermal_cooling_device_ops pcie_cooling_ops = {
+ .get_max_state = pcie_cooling_get_max_level,
+ .get_cur_state = pcie_cooling_get_cur_level,
+ .set_cur_state = pcie_cooling_set_cur_level,
+};
+
+struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
+{
+ char *name __free(kfree) =
+ kasprintf(GFP_KERNEL, COOLING_DEV_TYPE_PREFIX "%s", pci_name(port));
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+
+ return thermal_cooling_device_register(name, port, &pcie_cooling_ops);
+}
+
+void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+ thermal_cooling_device_unregister(cdev);
+}
+
+/* For bus_speed <-> state arithmetic */
+static_assert(PCIE_SPEED_2_5GT + 1 == PCIE_SPEED_5_0GT);
+static_assert(PCIE_SPEED_5_0GT + 1 == PCIE_SPEED_8_0GT);
+static_assert(PCIE_SPEED_8_0GT + 1 == PCIE_SPEED_16_0GT);
+static_assert(PCIE_SPEED_16_0GT + 1 == PCIE_SPEED_32_0GT);
+static_assert(PCIE_SPEED_32_0GT + 1 == PCIE_SPEED_64_0GT);
+
+MODULE_AUTHOR("Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>");
+MODULE_DESCRIPTION("PCIe cooling driver");
diff --git a/include/linux/pci-bwctrl.h b/include/linux/pci-bwctrl.h
new file mode 100644
index 000000000000..cee07127455b
--- /dev/null
+++ b/include/linux/pci-bwctrl.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * PCIe bandwidth controller
+ *
+ * Copyright (C) 2023-2024 Intel Corporation
+ */
+
+#ifndef LINUX_PCI_BWCTRL_H
+#define LINUX_PCI_BWCTRL_H
+
+#include <linux/pci.h>
+
+struct thermal_cooling_device;
+
+#ifdef CONFIG_PCIE_THERMAL
+struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port);
+void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev);
+#else
+static inline struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
+{
+ return NULL;
+}
+static inline void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+}
+#endif
+
+#endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 573b4c4c2be6..4f02d9f91399 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -313,12 +313,20 @@ struct pci_vpd {
};
struct irq_affinity;
+struct pcie_bwctrl_data;
struct pcie_link_state;
struct pci_sriov;
struct pci_p2pdma;
struct rcec_ea;
-/* The pci_dev structure describes PCI devices */
+/* struct pci_dev - describes a PCI device
+ *
+ * @supported_speeds: PCIe Supported Link Speeds Vector (+ reserved 0 at
+ * LSB). 0 when the supported speeds cannot be
+ * determined (e.g., for Root Complex Integrated
+ * Endpoints without the relevant Capability
+ * Registers).
+ */
struct pci_dev {
struct list_head bus_list; /* Node in per-bus list */
struct pci_bus *bus; /* Bus this device is on */
@@ -495,6 +503,7 @@ struct pci_dev {
unsigned int dpc_rp_extensions:1;
u8 dpc_rp_log_size;
#endif
+ struct pcie_bwctrl_data *link_bwctrl;
#ifdef CONFIG_PCI_ATS
union {
struct pci_sriov *sriov; /* PF: SR-IOV info */
@@ -522,6 +531,7 @@ struct pci_dev {
struct npem *npem; /* Native PCIe Enclosure Management */
#endif
u16 acs_cap; /* ACS Capability offset */
+ u8 supported_speeds; /* Supported Link Speeds Vector */
phys_addr_t rom; /* Physical address if not from BAR */
size_t romlen; /* Length if not from BAR */
/*
@@ -1274,6 +1284,7 @@ static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
{
switch (pos) {
case PCI_EXP_LNKCTL:
+ case PCI_EXP_LNKCTL2:
case PCI_EXP_RTCTL:
return pcie_capability_clear_and_set_word_locked(dev, pos,
clear, set);
@@ -1786,9 +1797,19 @@ static inline int pci_irqd_intx_xlate(struct irq_domain *d,
#ifdef CONFIG_PCIEPORTBUS
extern bool pcie_ports_disabled;
extern bool pcie_ports_native;
+
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+ bool use_lt);
#else
#define pcie_ports_disabled true
#define pcie_ports_native false
+
+static inline int pcie_set_target_speed(struct pci_dev *port,
+ enum pci_bus_speed speed_req,
+ bool use_lt)
+{
+ return -EOPNOTSUPP;
+}
#endif
#define PCIE_LINK_STATE_L0S (BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 12323b3334a9..f3c9de0a497c 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -678,6 +678,7 @@
#define PCI_EXP_DEVSTA2 0x2a /* Device Status 2 */
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c /* end of v2 EPs w/o link */
#define PCI_EXP_LNKCAP2 0x2c /* Link Capabilities 2 */
+#define PCI_EXP_LNKCAP2_SLS 0x000000fe /* Supported Link Speeds Vector */
#define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */
#define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5GT/s */
#define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index b38199965f99..7181756f47ff 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -72,6 +72,7 @@ TARGETS += net/packetdrill
TARGETS += net/rds
TARGETS += net/tcp_ao
TARGETS += nsfs
+TARGETS += pcie_bwctrl
TARGETS += perf_events
TARGETS += pidfd
TARGETS += pid_namespace
diff --git a/tools/testing/selftests/pcie_bwctrl/Makefile b/tools/testing/selftests/pcie_bwctrl/Makefile
new file mode 100644
index 000000000000..3e84e26341d1
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/Makefile
@@ -0,0 +1,2 @@
+TEST_PROGS = set_pcie_cooling_state.sh
+include ../lib.mk
diff --git a/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh b/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh
new file mode 100755
index 000000000000..9df606552af3
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/set_pcie_cooling_state.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+SYSFS=
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+retval=0
+skipmsg="skip all tests:"
+
+PCIEPORTTYPE="PCIe_Port_Link_Speed"
+
+prerequisite()
+{
+ local ports
+
+ if [ $UID != 0 ]; then
+ echo $skipmsg must be run as root >&2
+ exit $ksft_skip
+ fi
+
+ SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+ if [ ! -d "$SYSFS" ]; then
+ echo $skipmsg sysfs is not mounted >&2
+ exit $ksft_skip
+ fi
+
+ if ! ls $SYSFS/class/thermal/cooling_device* > /dev/null 2>&1; then
+ echo $skipmsg thermal cooling devices missing >&2
+ exit $ksft_skip
+ fi
+
+ ports=`grep -e "^$PCIEPORTTYPE" $SYSFS/class/thermal/cooling_device*/type | wc -l`
+ if [ $ports -eq 0 ]; then
+ echo $skipmsg pcie cooling devices missing >&2
+ exit $ksft_skip
+ fi
+}
+
+testport=
+find_pcie_port()
+{
+ local patt="$1"
+ local pcieports
+ local max
+ local cur
+ local delta
+ local bestdelta=-1
+
+ pcieports=`grep -l -F -e "$patt" /sys/class/thermal/cooling_device*/type`
+ if [ -z "$pcieports" ]; then
+ return
+ fi
+ pcieports=${pcieports//\/type/}
+ # Find the port with the highest PCIe Link Speed
+ for port in $pcieports; do
+ max=`cat $port/max_state`
+ cur=`cat $port/cur_state`
+ delta=$((max-cur))
+ if [ $delta -gt $bestdelta ]; then
+ testport="$port"
+ bestdelta=$delta
+ fi
+ done
+}
+
+sysfspcidev=
+find_sysfs_pci_dev()
+{
+ local typefile="$1/type"
+ local pcidir
+
+ pcidir="$SYSFS/bus/pci/devices/`sed -e "s|^${PCIEPORTTYPE}_||g" $typefile`"
+
+ if [ -r "$pcidir/current_link_speed" ]; then
+ sysfspcidev="$pcidir/current_link_speed"
+ fi
+}
+
+usage()
+{
+ echo "Usage $0 [ -d dev ]"
+ echo -e "\t-d: PCIe port BDF string (e.g., 0000:00:04.0)"
+}
+
+pattern="$PCIEPORTTYPE"
+parse_arguments()
+{
+ while getopts d:h opt; do
+ case $opt in
+ h)
+ usage "$0"
+ exit 0
+ ;;
+ d)
+ pattern="$PCIEPORTTYPE_$OPTARG"
+ ;;
+ *)
+ usage "$0"
+ exit 0
+ ;;
+ esac
+ done
+}
+
+parse_arguments "$@"
+prerequisite
+find_pcie_port "$pattern"
+if [ -z "$testport" ]; then
+ echo $skipmsg "pcie cooling device not found from sysfs" >&2
+ exit $ksft_skip
+fi
+find_sysfs_pci_dev "$testport"
+if [ -z "$sysfspcidev" ]; then
+ echo $skipmsg "PCIe port device not found from sysfs" >&2
+ exit $ksft_skip
+fi
+
+./set_pcie_speed.sh "$testport" "$sysfspcidev"
+retval=$?
+
+exit $retval
diff --git a/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh b/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh
new file mode 100755
index 000000000000..584596949312
--- /dev/null
+++ b/tools/testing/selftests/pcie_bwctrl/set_pcie_speed.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+set -e
+
+TESTNAME=set_pcie_speed
+
+declare -a PCIELINKSPEED=(
+ "2.5 GT/s PCIe"
+ "5.0 GT/s PCIe"
+ "8.0 GT/s PCIe"
+ "16.0 GT/s PCIe"
+ "32.0 GT/s PCIe"
+ "64.0 GT/s PCIe"
+)
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+retval=0
+
+coolingdev="$1"
+statefile="$coolingdev/cur_state"
+maxfile="$coolingdev/max_state"
+linkspeedfile="$2"
+
+oldstate=`cat $statefile`
+maxstate=`cat $maxfile`
+
+set_state()
+{
+ local state=$1
+ local linkspeed
+ local expected_linkspeed
+
+ echo $state > $statefile
+
+ sleep 1
+
+ linkspeed="`cat $linkspeedfile`"
+ expected_linkspeed=$((maxstate-state))
+ expected_str="${PCIELINKSPEED[$expected_linkspeed]}"
+ if [ ! "${expected_str}" = "${linkspeed}" ]; then
+ echo "$TESTNAME failed: expected: ${expected_str}; got ${linkspeed}"
+ retval=1
+ fi
+}
+
+cleanup_skip ()
+{
+ set_state $oldstate
+ exit $ksft_skip
+}
+
+trap cleanup_skip EXIT
+
+echo "$TESTNAME: testing states $maxstate .. $oldstate with $coolingdev"
+for i in $(seq $maxstate -1 $oldstate); do
+ set_state "$i"
+done
+
+trap EXIT
+if [ $retval -eq 0 ]; then
+ echo "$TESTNAME [PASS]"
+else
+ echo "$TESTNAME [FAIL]"
+fi
+exit $retval