summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/admin-guide/pm/suspend-flows.rst270
-rw-r--r--Documentation/admin-guide/pm/system-wide.rst1
-rw-r--r--drivers/acpi/sleep.c4
-rw-r--r--drivers/acpi/sleep.h1
-rw-r--r--drivers/acpi/wakeup.c81
-rw-r--r--drivers/base/power/main.c7
-rw-r--r--drivers/cpufreq/Kconfig4
-rw-r--r--drivers/cpufreq/Kconfig.x862
-rw-r--r--drivers/platform/x86/intel_int0002_vgpio.c10
-rw-r--r--include/linux/acpi.h5
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c7
13 files changed, 389 insertions, 8 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4d5a4fe22703..c78b463f5f79 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3720,6 +3720,9 @@
Override pmtimer IOPort with a hex value.
e.g. pmtmr=0x508
+ pm_debug_messages [SUSPEND,KNL]
+ Enable suspend/resume debug messages during boot up.
+
pnp.debug=1 [PNP]
Enable PNP debug messages (depends on the
CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time
diff --git a/Documentation/admin-guide/pm/suspend-flows.rst b/Documentation/admin-guide/pm/suspend-flows.rst
new file mode 100644
index 000000000000..c479d7462647
--- /dev/null
+++ b/Documentation/admin-guide/pm/suspend-flows.rst
@@ -0,0 +1,270 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+=========================
+System Suspend Code Flows
+=========================
+
+:Copyright: |copy| 2020 Intel Corporation
+
+:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+At least one global system-wide transition needs to be carried out for the
+system to get from the working state into one of the supported
+:doc:`sleep states <sleep-states>`. Hibernation requires more than one
+transition to occur for this purpose, but the other sleep states, commonly
+referred to as *system-wide suspend* (or simply *system suspend*) states, need
+only one.
+
+For those sleep states, the transition from the working state of the system into
+the target sleep state is referred to as *system suspend* too (in the majority
+of cases, whether this means a transition or a sleep state of the system should
+be clear from the context) and the transition back from the sleep state into the
+working state is referred to as *system resume*.
+
+The kernel code flows associated with the suspend and resume transitions for
+different sleep states of the system are quite similar, but there are some
+significant differences between the :ref:`suspend-to-idle <s2idle>` code flows
+and the code flows related to the :ref:`suspend-to-RAM <s2ram>` and
+:ref:`standby <standby>` sleep states.
+
+The :ref:`suspend-to-RAM <s2ram>` and :ref:`standby <standby>` sleep states
+cannot be implemented without platform support and the difference between them
+boils down to the platform-specific actions carried out by the suspend and
+resume hooks that need to be provided by the platform driver to make them
+available. Apart from that, the suspend and resume code flows for these sleep
+states are mostly identical, so they both together will be referred to as
+*platform-dependent suspend* states in what follows.
+
+
+.. _s2idle_suspend:
+
+Suspend-to-idle Suspend Code Flow
+=================================
+
+The following steps are taken in order to transition the system from the working
+state to the :ref:`suspend-to-idle <s2idle>` sleep state:
+
+ 1. Invoking system-wide suspend notifiers.
+
+ Kernel subsystems can register callbacks to be invoked when the suspend
+ transition is about to occur and when the resume transition has finished.
+
+ That allows them to prepare for the change of the system state and to clean
+ up after getting back to the working state.
+
+ 2. Freezing tasks.
+
+ Tasks are frozen primarily in order to avoid unchecked hardware accesses
+ from user space through MMIO regions or I/O registers exposed directly to
+ it and to prevent user space from entering the kernel while the next step
+ of the transition is in progress (which might have been problematic for
+ various reasons).
+
+ All user space tasks are intercepted as though they were sent a signal and
+ put into uninterruptible sleep until the end of the subsequent system resume
+ transition.
+
+ The kernel threads that choose to be frozen during system suspend for
+ specific reasons are frozen subsequently, but they are not intercepted.
+ Instead, they are expected to periodically check whether or not they need
+ to be frozen and to put themselves into uninterruptible sleep if so. [Note,
+ however, that kernel threads can use locking and other concurrency controls
+ available in kernel space to synchronize themselves with system suspend and
+ resume, which can be much more precise than the freezing, so the latter is
+ not a recommended option for kernel threads.]
+
+ 3. Suspending devices and reconfiguring IRQs.
+
+ Devices are suspended in four phases called *prepare*, *suspend*,
+ *late suspend* and *noirq suspend* (see :ref:`driverapi_pm_devices` for more
+ information on what exactly happens in each phase).
+
+ Every device is visited in each phase, but typically it is not physically
+ accessed in more than two of them.
+
+ The runtime PM API is disabled for every device during the *late* suspend
+ phase and high-level ("action") interrupt handlers are prevented from being
+ invoked before the *noirq* suspend phase.
+
+ Interrupts are still handled after that, but they are only acknowledged to
+ interrupt controllers without performing any device-specific actions that
+ would be triggered in the working state of the system (those actions are
+ deferred till the subsequent system resume transition as described
+ `below <s2idle_resume_>`_).
+
+ IRQs associated with system wakeup devices are "armed" so that the resume
+ transition of the system is started when one of them signals an event.
+
+ 4. Freezing the scheduler tick and suspending timekeeping.
+
+ When all devices have been suspended, CPUs enter the idle loop and are put
+ into the deepest available idle state. While doing that, each of them
+ "freezes" its own scheduler tick so that the timer events associated with
+ the tick do not occur until the CPU is woken up by another interrupt source.
+
+ The last CPU to enter the idle state also stops the timekeeping which
+ (among other things) prevents high resolution timers from triggering going
+ forward until the first CPU that is woken up restarts the timekeeping.
+ That allows the CPUs to stay in the deep idle state relatively long in one
+ go.
+
+ From this point on, the CPUs can only be woken up by non-timer hardware
+ interrupts. If that happens, they go back to the idle state unless the
+ interrupt that woke up one of them comes from an IRQ that has been armed for
+ system wakeup, in which case the system resume transition is started.
+
+
+.. _s2idle_resume:
+
+Suspend-to-idle Resume Code Flow
+================================
+
+The following steps are taken in order to transition the system from the
+:ref:`suspend-to-idle <s2idle>` sleep state into the working state:
+
+ 1. Resuming timekeeping and unfreezing the scheduler tick.
+
+ When one of the CPUs is woken up (by a non-timer hardware interrupt), it
+ leaves the idle state entered in the last step of the preceding suspend
+ transition, restarts the timekeeping (unless it has been restarted already
+ by another CPU that woke up earlier) and the scheduler tick on that CPU is
+ unfrozen.
+
+ If the interrupt that has woken up the CPU was armed for system wakeup,
+ the system resume transition begins.
+
+ 2. Resuming devices and restoring the working-state configuration of IRQs.
+
+ Devices are resumed in four phases called *noirq resume*, *early resume*,
+ *resume* and *complete* (see :ref:`driverapi_pm_devices` for more
+ information on what exactly happens in each phase).
+
+ Every device is visited in each phase, but typically it is not physically
+ accessed in more than two of them.
+
+ The working-state configuration of IRQs is restored after the *noirq* resume
+ phase and the runtime PM API is re-enabled for every device whose driver
+ supports it during the *early* resume phase.
+
+ 3. Thawing tasks.
+
+ Tasks frozen in step 2 of the preceding `suspend <s2idle_suspend_>`_
+ transition are "thawed", which means that they are woken up from the
+ uninterruptible sleep that they went into at that time and user space tasks
+ are allowed to exit the kernel.
+
+ 4. Invoking system-wide resume notifiers.
+
+ This is analogous to step 1 of the `suspend <s2idle_suspend_>`_ transition
+ and the same set of callbacks is invoked at this point, but a different
+ "notification type" parameter value is passed to them.
+
+
+Platform-dependent Suspend Code Flow
+====================================
+
+The following steps are taken in order to transition the system from the working
+state to platform-dependent suspend state:
+
+ 1. Invoking system-wide suspend notifiers.
+
+ This step is the same as step 1 of the suspend-to-idle suspend transition
+ described `above <s2idle_suspend_>`_.
+
+ 2. Freezing tasks.
+
+ This step is the same as step 2 of the suspend-to-idle suspend transition
+ described `above <s2idle_suspend_>`_.
+
+ 3. Suspending devices and reconfiguring IRQs.
+
+ This step is analogous to step 3 of the suspend-to-idle suspend transition
+ described `above <s2idle_suspend_>`_, but the arming of IRQs for system
+ wakeup generally does not have any effect on the platform.
+
+ There are platforms that can go into a very deep low-power state internally
+ when all CPUs in them are in sufficiently deep idle states and all I/O
+ devices have been put into low-power states. On those platforms,
+ suspend-to-idle can reduce system power very effectively.
+
+ On the other platforms, however, low-level components (like interrupt
+ controllers) need to be turned off in a platform-specific way (implemented
+ in the hooks provided by the platform driver) to achieve comparable power
+ reduction.
+
+ That usually prevents in-band hardware interrupts from waking up the system,
+ which must be done in a special platform-dependent way. Then, the
+ configuration of system wakeup sources usually starts when system wakeup
+ devices are suspended and is finalized by the platform suspend hooks later
+ on.
+
+ 4. Disabling non-boot CPUs.
+
+ On some platforms the suspend hooks mentioned above must run in a one-CPU
+ configuration of the system (in particular, the hardware cannot be accessed
+ by any code running in parallel with the platform suspend hooks that may,
+ and often do, trap into the platform firmware in order to finalize the
+ suspend transition).
+
+ For this reason, the CPU offline/online (CPU hotplug) framework is used
+ to take all of the CPUs in the system, except for one (the boot CPU),
+ offline (typically, the CPUs that have been taken offline go into deep idle
+ states).
+
+ This means that all tasks are migrated away from those CPUs and all IRQs are
+ rerouted to the only CPU that remains online.
+
+ 5. Suspending core system components.
+
+ This prepares the core system components for (possibly) losing power going
+ forward and suspends the timekeeping.
+
+ 6. Platform-specific power removal.
+
+ This is expected to remove power from all of the system components except
+ for the memory controller and RAM (in order to preserve the contents of the
+ latter) and some devices designated for system wakeup.
+
+ In many cases control is passed to the platform firmware which is expected
+ to finalize the suspend transition as needed.
+
+
+Platform-dependent Resume Code Flow
+===================================
+
+The following steps are taken in order to transition the system from a
+platform-dependent suspend state into the working state:
+
+ 1. Platform-specific system wakeup.
+
+ The platform is woken up by a signal from one of the designated system
+ wakeup devices (which need not be an in-band hardware interrupt) and
+ control is passed back to the kernel (the working configuration of the
+ platform may need to be restored by the platform firmware before the
+ kernel gets control again).
+
+ 2. Resuming core system components.
+
+ The suspend-time configuration of the core system components is restored and
+ the timekeeping is resumed.
+
+ 3. Re-enabling non-boot CPUs.
+
+ The CPUs disabled in step 4 of the preceding suspend transition are taken
+ back online and their suspend-time configuration is restored.
+
+ 4. Resuming devices and restoring the working-state configuration of IRQs.
+
+ This step is the same as step 2 of the suspend-to-idle suspend transition
+ described `above <s2idle_resume_>`_.
+
+ 5. Thawing tasks.
+
+ This step is the same as step 3 of the suspend-to-idle suspend transition
+ described `above <s2idle_resume_>`_.
+
+ 6. Invoking system-wide resume notifiers.
+
+ This step is the same as step 4 of the suspend-to-idle suspend transition
+ described `above <s2idle_resume_>`_.
diff --git a/Documentation/admin-guide/pm/system-wide.rst b/Documentation/admin-guide/pm/system-wide.rst
index 2b1f987b34f0..1a1924d71006 100644
--- a/Documentation/admin-guide/pm/system-wide.rst
+++ b/Documentation/admin-guide/pm/system-wide.rst
@@ -8,3 +8,4 @@ System-Wide Power Management
:maxdepth: 2
sleep-states
+ suspend-flows
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index bb1ae400ec1f..4edc8a3ce40f 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -1009,6 +1009,10 @@ static bool acpi_s2idle_wake(void)
if (acpi_any_fixed_event_status_set())
return true;
+ /* Check wakeups from drivers sharing the SCI. */
+ if (acpi_check_wakeup_handlers())
+ return true;
+
/*
* If the status bit is set for any enabled GPE other than the
* EC one, the wakeup is regarded as a genuine one.
diff --git a/drivers/acpi/sleep.h b/drivers/acpi/sleep.h
index 41675d24a9bc..3d90480ce1b1 100644
--- a/drivers/acpi/sleep.h
+++ b/drivers/acpi/sleep.h
@@ -2,6 +2,7 @@
extern void acpi_enable_wakeup_devices(u8 sleep_state);
extern void acpi_disable_wakeup_devices(u8 sleep_state);
+extern bool acpi_check_wakeup_handlers(void);
extern struct list_head acpi_wakeup_device_list;
extern struct mutex acpi_device_lock;
diff --git a/drivers/acpi/wakeup.c b/drivers/acpi/wakeup.c
index c28244df56a5..0b2e42530adf 100644
--- a/drivers/acpi/wakeup.c
+++ b/drivers/acpi/wakeup.c
@@ -12,6 +12,15 @@
#include "internal.h"
#include "sleep.h"
+struct acpi_wakeup_handler {
+ struct list_head list_node;
+ bool (*wakeup)(void *context);
+ void *context;
+};
+
+static LIST_HEAD(acpi_wakeup_handler_head);
+static DEFINE_MUTEX(acpi_wakeup_handler_mutex);
+
/*
* We didn't lock acpi_device_lock in the file, because it invokes oops in
* suspend/resume and isn't really required as this is called in S-state. At
@@ -90,3 +99,75 @@ int __init acpi_wakeup_device_init(void)
mutex_unlock(&acpi_device_lock);
return 0;
}
+
+/**
+ * acpi_register_wakeup_handler - Register wakeup handler
+ * @wake_irq: The IRQ through which the device may receive wakeups
+ * @wakeup: Wakeup-handler to call when the SCI has triggered a wakeup
+ * @context: Context to pass to the handler when calling it
+ *
+ * Drivers which may share an IRQ with the SCI can use this to register
+ * a handler which returns true when the device they are managing wants
+ * to trigger a wakeup.
+ */
+int acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context),
+ void *context)
+{
+ struct acpi_wakeup_handler *handler;
+
+ /*
+ * If the device is not sharing its IRQ with the SCI, there is no
+ * need to register the handler.
+ */
+ if (!acpi_sci_irq_valid() || wake_irq != acpi_sci_irq)
+ return 0;
+
+ handler = kmalloc(sizeof(*handler), GFP_KERNEL);
+ if (!handler)
+ return -ENOMEM;
+
+ handler->wakeup = wakeup;
+ handler->context = context;
+
+ mutex_lock(&acpi_wakeup_handler_mutex);
+ list_add(&handler->list_node, &acpi_wakeup_handler_head);
+ mutex_unlock(&acpi_wakeup_handler_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_register_wakeup_handler);
+
+/**
+ * acpi_unregister_wakeup_handler - Unregister wakeup handler
+ * @wakeup: Wakeup-handler passed to acpi_register_wakeup_handler()
+ * @context: Context passed to acpi_register_wakeup_handler()
+ */
+void acpi_unregister_wakeup_handler(bool (*wakeup)(void *context),
+ void *context)
+{
+ struct acpi_wakeup_handler *handler;
+
+ mutex_lock(&acpi_wakeup_handler_mutex);
+ list_for_each_entry(handler, &acpi_wakeup_handler_head, list_node) {
+ if (handler->wakeup == wakeup && handler->context == context) {
+ list_del(&handler->list_node);
+ kfree(handler);
+ break;
+ }
+ }
+ mutex_unlock(&acpi_wakeup_handler_mutex);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_wakeup_handler);
+
+bool acpi_check_wakeup_handlers(void)
+{
+ struct acpi_wakeup_handler *handler;
+
+ /* No need to lock, nothing else is running when we're called. */
+ list_for_each_entry(handler, &acpi_wakeup_handler_head, list_node) {
+ if (handler->wakeup(handler->context))
+ return true;
+ }
+
+ return false;
+}
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 6d1dee7051eb..fdd508a78ffd 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1922,10 +1922,6 @@ static int device_prepare(struct device *dev, pm_message_t state)
if (dev->power.syscore)
return 0;
- WARN_ON(!pm_runtime_enabled(dev) &&
- dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND |
- DPM_FLAG_LEAVE_SUSPENDED));
-
/*
* If a device's parent goes into runtime suspend at the wrong time,
* it won't be possible to resume the device. To prevent this we
@@ -1973,8 +1969,7 @@ unlock:
*/
spin_lock_irq(&dev->power.lock);
dev->power.direct_complete = state.event == PM_EVENT_SUSPEND &&
- ((pm_runtime_suspended(dev) && ret > 0) ||
- dev->power.no_pm_callbacks) &&
+ (ret > 0 || dev->power.no_pm_callbacks) &&
!dev_pm_test_driver_flags(dev, DPM_FLAG_NEVER_SKIP);
spin_unlock_irq(&dev->power.lock);
return 0;
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index bff5295016ae..c3e6bd59e920 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -37,10 +37,12 @@ config CPU_FREQ_STAT
choice
prompt "Default CPUFreq governor"
default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ
+ default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if BIG_LITTLE
+ default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
help
This option sets which CPUFreq governor shall be loaded at
- startup. If in doubt, select 'performance'.
+ startup. If in doubt, use the default setting.
config CPU_FREQ_DEFAULT_GOV_PERFORMANCE
bool "performance"
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 62502d0e4c33..bc58a0809d11 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -8,6 +8,8 @@ config X86_INTEL_PSTATE
depends on X86
select ACPI_PROCESSOR if ACPI
select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
+ select CPU_FREQ_GOV_PERFORMANCE
+ select CPU_FREQ_GOV_SCHEDUTIL if SMP
help
This driver provides a P state for Intel core processors.
The driver implements an internal governor and will become
diff --git a/drivers/platform/x86/intel_int0002_vgpio.c b/drivers/platform/x86/intel_int0002_vgpio.c
index 7b23efc46a43..289c6655d425 100644
--- a/drivers/platform/x86/intel_int0002_vgpio.c
+++ b/drivers/platform/x86/intel_int0002_vgpio.c
@@ -127,6 +127,14 @@ static irqreturn_t int0002_irq(int irq, void *data)
return IRQ_HANDLED;
}
+static bool int0002_check_wake(void *data)
+{
+ u32 gpe_sts_reg;
+
+ gpe_sts_reg = inl(GPE0A_STS_PORT);
+ return (gpe_sts_reg & GPE0A_PME_B0_STS_BIT);
+}
+
static struct irq_chip int0002_byt_irqchip = {
.name = DRV_NAME,
.irq_ack = int0002_irq_ack,
@@ -220,6 +228,7 @@ static int int0002_probe(struct platform_device *pdev)
return ret;
}
+ acpi_register_wakeup_handler(irq, int0002_check_wake, NULL);
device_init_wakeup(dev, true);
return 0;
}
@@ -227,6 +236,7 @@ static int int0002_probe(struct platform_device *pdev)
static int int0002_remove(struct platform_device *pdev)
{
device_init_wakeup(&pdev->dev, false);
+ acpi_unregister_wakeup_handler(int0002_check_wake, NULL);
return 0;
}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b7d3caf6f205..9f70b7807d53 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -488,6 +488,11 @@ void __init acpi_nvs_nosave_s3(void);
void __init acpi_sleep_no_blacklist(void);
#endif /* CONFIG_PM_SLEEP */
+int acpi_register_wakeup_handler(
+ int wake_irq, bool (*wakeup)(void *context), void *context);
+void acpi_unregister_wakeup_handler(
+ bool (*wakeup)(void *context), void *context);
+
struct acpi_osc_context {
char *uuid_str; /* UUID string */
int rev;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6dbeedb7354c..86aba8706b16 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -678,7 +678,7 @@ static int load_image_and_restore(void)
error = swsusp_read(&flags);
swsusp_close(FMODE_READ);
if (!error)
- hibernation_restore(flags & SF_PLATFORM_MODE);
+ error = hibernation_restore(flags & SF_PLATFORM_MODE);
pr_err("Failed to load image, recovering.\n");
swsusp_free();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 69b7a8aeca3b..40f86ec4ab30 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -535,6 +535,13 @@ static ssize_t pm_debug_messages_store(struct kobject *kobj,
power_attr(pm_debug_messages);
+static int __init pm_debug_messages_setup(char *str)
+{
+ pm_debug_messages_on = true;
+ return 1;
+}
+__setup("pm_debug_messages", pm_debug_messages_setup);
+
/**
* __pm_pr_dbg - Print a suspend debug message to the kernel log.
* @defer: Whether or not to use printk_deferred() to print the message.