diff options
Diffstat (limited to 'kernel/power')
| -rw-r--r-- | kernel/power/Kconfig | 222 | ||||
| -rw-r--r-- | kernel/power/Makefile | 17 | ||||
| -rw-r--r-- | kernel/power/autosleep.c | 9 | ||||
| -rw-r--r-- | kernel/power/block_io.c | 103 | ||||
| -rw-r--r-- | kernel/power/console.c | 26 | ||||
| -rw-r--r-- | kernel/power/em_netlink.c | 308 | ||||
| -rw-r--r-- | kernel/power/em_netlink.h | 39 | ||||
| -rw-r--r-- | kernel/power/em_netlink_autogen.c | 48 | ||||
| -rw-r--r-- | kernel/power/em_netlink_autogen.h | 23 | ||||
| -rw-r--r-- | kernel/power/energy_model.c | 1046 | ||||
| -rw-r--r-- | kernel/power/hibernate.c | 909 | ||||
| -rw-r--r-- | kernel/power/main.c | 787 | ||||
| -rw-r--r-- | kernel/power/power.h | 120 | ||||
| -rw-r--r-- | kernel/power/poweroff.c | 7 | ||||
| -rw-r--r-- | kernel/power/process.c | 114 | ||||
| -rw-r--r-- | kernel/power/qos.c | 858 | ||||
| -rw-r--r-- | kernel/power/snapshot.c | 1901 | ||||
| -rw-r--r-- | kernel/power/suspend.c | 504 | ||||
| -rw-r--r-- | kernel/power/suspend_test.c | 86 | ||||
| -rw-r--r-- | kernel/power/swap.c | 990 | ||||
| -rw-r--r-- | kernel/power/user.c | 265 | ||||
| -rw-r--r-- | kernel/power/wakelock.c | 66 |
22 files changed, 6117 insertions, 2331 deletions
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d444c4e834f4..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,8 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE default y - ---help--- + help Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). @@ -18,6 +19,19 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. +config SUSPEND_SKIP_SYNC + bool "Skip kernel's sys_sync() on suspend to RAM/standby" + depends on SUSPEND + depends on EXPERT + help + Skip the kernel sys_sync() before freezing user processes. + Some systems prefer not to pay this cost on every invocation + of suspend, or they are content with invoking sync() from + user-space before invoking suspend. There's a run-time switch + at '/sys/power/sync_on_suspend' to configure this behaviour. + This setting changes the default for the run-tim switch. Say Y + to change the default to disable the kernel sys_sync(). + config HIBERNATE_CALLBACKS bool @@ -25,10 +39,10 @@ config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE select HIBERNATE_CALLBACKS - select LZO_COMPRESS - select LZO_DECOMPRESS select CRC32 - ---help--- + select CRYPTO + select CRYPTO_LZO + help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. @@ -55,7 +69,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - <file:Documentation/power/swsusp-and-swap-files.txt>). + <file:Documentation/power/swsusp-and-swap-files.rst>). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -64,36 +78,68 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at <file:Documentation/power/swsusp.txt>. + For more information take a look at <file:Documentation/power/swsusp.rst>. -config ARCH_SAVE_PAGE_KEYS - bool +config HIBERNATION_SNAPSHOT_DEV + bool "Userspace snapshot device" + depends on HIBERNATION + default y + help + Device used by the uswsusp tools. + + Say N if no snapshotting from userspace is needed, this also + reduces the attack surface of the kernel. + + If in doubt, say Y. + +choice + prompt "Default compressor" + default HIBERNATION_COMP_LZO + depends on HIBERNATION + +config HIBERNATION_COMP_LZO + bool "lzo" + depends on CRYPTO_LZO + +config HIBERNATION_COMP_LZ4 + bool "lz4" + depends on CRYPTO_LZ4 + +endchoice + +config HIBERNATION_DEF_COMP + string + default "lzo" if HIBERNATION_COMP_LZO + default "lz4" if HIBERNATION_COMP_LZ4 + help + Default compressor to be used for hibernation. config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION default "" - ---help--- + help The default resume partition is the partition that the suspend- - to-disk implementation will look for a suspended disk image. + to-disk implementation will look for a suspended disk image. - The partition specified here will be different for almost every user. + The partition specified here will be different for almost every user. It should be a valid swap partition (at least for now) that is turned - on before suspending. + on before suspending. The partition specified can be overridden by specifying: - resume=/dev/<other device> + resume=/dev/<other device> - which will set the resume partition to the device specified. + which will set the resume partition to the device specified. Note there is currently not a way to specify which device to save the - suspended image to. It will simply pick the first available swap + suspended image to. It will simply pick the first available swap device. config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS + select PM config PM_SLEEP_SMP def_bool y @@ -102,19 +148,46 @@ config PM_SLEEP_SMP depends on PM_SLEEP select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + help + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. + config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP - default n - ---help--- + help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. +config PM_USERSPACE_AUTOSLEEP + bool "Userspace opportunistic sleep" + depends on PM_SLEEP + help + Notify kernel of aggressive userspace autosleep power management policy. + + This option changes the behavior of various sleep-sensitive code to deal + with frequent userspace-initiated transitions into a global sleep state. + + Saying Y here, disables code paths that most users really should keep + enabled. In particular, only enable this if it is very common to be + asleep/awake for very short periods of time (<= 2 seconds). + + Only platforms, such as Android, that implement opportunistic sleep from + a userspace power manager service should enable this option; and not + other machines. Therefore, you should say N here, unless you are + extremely certain that this is what you want. The option otherwise has + bad, undesirable effects, and should not be enabled just for fun. + + config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP - default n - ---help--- + help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -129,28 +202,34 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on !IA64_HP_SIM - ---help--- +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + +config PM + bool "Device power management core functionality" + help Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated + (low power) states, for example after a specified period of inactivity + (autosuspended), and woken up in response to a hardware-generated wake-up event or a driver's request. Hardware support is generally required for this functionality to work and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and + responsible for the actual handling of device suspend requests and wake-up events. -config PM - def_bool y - depends on PM_SLEEP || PM_RUNTIME - config PM_DEBUG bool "Power Management Debug Support" depends on PM - ---help--- + help This option enables various debugging support in the Power Management code. This is helpful when debugging and reporting PM bugs, like suspend support. @@ -158,7 +237,7 @@ config PM_DEBUG config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - ---help--- + help Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". @@ -166,7 +245,7 @@ config PM_ADVANCED_DEBUG config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- + help This option will let you suspend your machine during bootup, and make it wake up a few seconds later using an RTC wakeup alarm. Enable this with a kernel parameter like "test_suspend=mem". @@ -178,6 +257,41 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE && EXPERT + help + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout to panic in seconds" + range 1 120 + default 120 + depends on DPM_WATCHDOG + +config DPM_WATCHDOG_WARNING_TIMEOUT + int "Watchdog timeout to warn in seconds" + range 1 DPM_WATCHDOG_TIMEOUT + default DPM_WATCHDOG_TIMEOUT + depends on DPM_WATCHDOG + help + If the DPM watchdog warning timeout and main timeout are + different then a non-fatal warning (with a stack trace of + the stuck suspend routine) will be printed when the warning + timeout expires. If the suspend routine gets un-stuck + before the main timeout expires then no other action is + taken. If the routine continues to be stuck and the main + timeout expires then an emergency-level message and stack + trace will be printed and the system will panic. + + If the warning timeout is equal to the main timeout (the + default) then the warning will never happen and the system + will jump straight to panic when the main timeout expires. + config PM_TRACE bool help @@ -198,7 +312,7 @@ config PM_TRACE_RTC depends on PM_SLEEP_DEBUG depends on X86 select PM_TRACE - ---help--- + help This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). @@ -213,7 +327,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -223,7 +337,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/power/apm-acpi.txt> + and more information, read <file:Documentation/power/apm-acpi.rst> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. @@ -237,23 +351,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config ARCH_HAS_OPP - bool - -config PM_OPP - bool "Operating Performance Point (OPP) Layer library" - depends on ARCH_HAS_OPP - ---help--- - SOCs have a standard set of tuples consisting of frequency and - voltage pairs that the device will support per voltage domain. This - is called Operating Performance Point or OPP. The actual definitions - of OPP varies over silicon within the same family of devices. - - OPP layer organizes the data internally using device pointers - representing individual voltage domains and provides SOC - implementations a ready to use framework to manage OPPs. - For more information, read <file:Documentation/power/opp.txt> - config PM_CLK def_bool y depends on PM && HAVE_CLK @@ -265,7 +362,6 @@ config PM_GENERIC_DOMAINS config WQ_POWER_EFFICIENT_DEFAULT bool "Enable workqueue power-efficient mode by default" depends on PM - default n help Per-cpu workqueues are generally preferred because they show better performance thanks to cache locality; unfortunately, @@ -286,10 +382,22 @@ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS -config PM_GENERIC_DOMAINS_RUNTIME +config PM_GENERIC_DOMAINS_OF def_bool y - depends on PM_RUNTIME && PM_GENERIC_DOMAINS + depends on PM_GENERIC_DOMAINS && OF config CPU_PM bool - depends on SUSPEND || CPU_IDLE + +config ENERGY_MODEL + bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" + depends on CPU_FREQ || PM_DEVFREQ + help + Several subsystems (thermal and/or the task scheduler for example) + can leverage information about the energy consumed by devices to + make smarter decisions. This config option enables the framework + from which subsystems can access the energy models. + + The exact usage of the energy model is subsystem-dependent. + + If in doubt, say N. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bff11ef..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,5 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif + +KASAN_SANITIZE_snapshot.o := n obj-y += qos.o obj-$(CONFIG_PM) += main.o @@ -7,9 +14,13 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ - block_io.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o +obj-$(CONFIG_HIBERNATION_SNAPSHOT_DEV) += user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9a..865df641b97c 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/power/autosleep.c * @@ -8,7 +9,6 @@ #include <linux/device.h> #include <linux/mutex.h> -#include <linux/pm_wakeup.h> #include "power.h" @@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work) mutex_lock(&autosleep_lock); - if (!pm_save_wakeup_count(initial_count)) { + if (!pm_save_wakeup_count(initial_count) || + system_state != SYSTEM_RUNNING) { mutex_unlock(&autosleep_lock); goto out; } @@ -52,7 +53,7 @@ static void try_to_suspend(struct work_struct *work) goto out; /* - * If the wakeup occured for an unknown reason, wait to prevent the + * If the wakeup occurred for an unknown reason, wait to prevent the * system from trying to suspend and waking up in a tight loop. */ if (final_count == initial_count) @@ -114,7 +115,7 @@ int pm_autosleep_set_state(suspend_state_t state) int __init pm_autosleep_init(void) { - autosleep_ws = wakeup_source_register("autosleep"); + autosleep_ws = wakeup_source_register(NULL, "autosleep"); if (!autosleep_ws) return -ENOMEM; diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c deleted file mode 100644 index d09dd10c5a5e..000000000000 --- a/kernel/power/block_io.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * This file provides functions for block I/O operations on swap/file. - * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> - * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. - */ - -#include <linux/bio.h> -#include <linux/kernel.h> -#include <linux/pagemap.h> -#include <linux/swap.h> - -#include "power.h" - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, struct block_device *bdev, sector_t sector, - struct page *page, struct bio **bio_chain) -{ - const int bio_rw = rw | REQ_SYNC; - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)sector); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} diff --git a/kernel/power/console.c b/kernel/power/console.c index 463aa6736751..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Functions for saving/restoring console. * @@ -9,11 +10,13 @@ #include <linux/kbd_kern.h> #include <linux/vt.h> #include <linux/module.h> +#include <linux/slab.h> #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -41,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -55,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -64,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); @@ -81,6 +88,7 @@ void pm_vt_switch_unregister(struct device *dev) list_for_each_entry(tmp, &pm_vt_switch_list, head) { if (tmp->dev == dev) { list_del(&tmp->head); + kfree(tmp); break; } } @@ -124,26 +132,30 @@ out: return ret; } -int pm_prepare_console(void) +void pm_prepare_console(void) { if (!pm_vt_switch()) - return 0; + return; orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) - return 1; + return; + + vt_switch_done = true; orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); - return 0; + return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 000000000000..11af9f64aa82 --- /dev/null +++ b/kernel/power/energy_model.c @@ -0,0 +1,1046 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Energy Model of devices + * + * Copyright (c) 2018-2021, Arm ltd. + * Written by: Quentin Perret, Arm ltd. + * Improvements provided by: Lukasz Luba, Arm ltd. + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/cpu.h> +#include <linux/cpufreq.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/sched/topology.h> +#include <linux/slab.h> + +#include "em_netlink.h" + +/* + * Mutex serializing the registrations of performance domains and letting + * callbacks defined by drivers sleep. + */ +static DEFINE_MUTEX(em_pd_mutex); + +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + +static void em_cpufreq_update_efficiencies(struct device *dev, + struct em_perf_state *table); +static void em_check_capacity_update(void); +static void em_update_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn); + +static bool _is_cpu_device(struct device *dev) +{ + return (dev->bus == &cpu_subsys); +} + +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +struct em_dbg_info { + struct em_perf_domain *pd; + int ps_id; +}; + +#define DEFINE_EM_DBG_SHOW(name, fname) \ +static int em_debug_##fname##_show(struct seq_file *s, void *unused) \ +{ \ + struct em_dbg_info *em_dbg = s->private; \ + struct em_perf_state *table; \ + unsigned long val; \ + \ + rcu_read_lock(); \ + table = em_perf_state_from_pd(em_dbg->pd); \ + val = table[em_dbg->ps_id].name; \ + rcu_read_unlock(); \ + \ + seq_printf(s, "%lu\n", val); \ + return 0; \ +} \ +DEFINE_SHOW_ATTRIBUTE(em_debug_##fname) + +DEFINE_EM_DBG_SHOW(frequency, frequency); +DEFINE_EM_DBG_SHOW(power, power); +DEFINE_EM_DBG_SHOW(cost, cost); +DEFINE_EM_DBG_SHOW(performance, performance); +DEFINE_EM_DBG_SHOW(flags, inefficiency); + +static void em_debug_create_ps(struct em_perf_domain *em_pd, + struct em_dbg_info *em_dbg, int i, + struct dentry *pd) +{ + struct em_perf_state *table; + unsigned long freq; + struct dentry *d; + char name[24]; + + em_dbg[i].pd = em_pd; + em_dbg[i].ps_id = i; + + rcu_read_lock(); + table = em_perf_state_from_pd(em_pd); + freq = table[i].frequency; + rcu_read_unlock(); + + snprintf(name, sizeof(name), "ps:%lu", freq); + + /* Create per-ps directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_file("frequency", 0444, d, &em_dbg[i], + &em_debug_frequency_fops); + debugfs_create_file("power", 0444, d, &em_dbg[i], + &em_debug_power_fops); + debugfs_create_file("cost", 0444, d, &em_dbg[i], + &em_debug_cost_fops); + debugfs_create_file("performance", 0444, d, &em_dbg[i], + &em_debug_performance_fops); + debugfs_create_file("inefficient", 0444, d, &em_dbg[i], + &em_debug_inefficiency_fops); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static int em_debug_flags_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%#lx\n", pd->flags); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); + +static int em_debug_id_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + + seq_printf(s, "%d\n", pd->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_id); + +static void em_debug_create_pd(struct device *dev) +{ + struct em_dbg_info *em_dbg; + struct dentry *d; + int i; + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(dev_name(dev), rootdir); + + if (_is_cpu_device(dev)) + debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, + &em_debug_cpus_fops); + + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); + + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, + sizeof(*em_dbg), GFP_KERNEL); + if (!em_dbg) + return; + + /* Create a sub-directory for each performance state */ + for (i = 0; i < dev->em_pd->nr_perf_states; i++) + em_debug_create_ps(dev->em_pd, em_dbg, i, d); + +} + +static void em_debug_remove_pd(struct device *dev) +{ + debugfs_lookup_and_remove(dev_name(dev), rootdir); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +fs_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void em_debug_create_pd(struct device *dev) {} +static void em_debug_remove_pd(struct device *dev) {} +#endif + +static void em_release_table_kref(struct kref *kref) +{ + /* It was the last owner of this table so we can free */ + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); +} + +/** + * em_table_free() - Handles safe free of the EM table when needed + * @table : EM table which is going to be freed + * + * No return values. + */ +void em_table_free(struct em_perf_table *table) +{ + kref_put(&table->kref, em_release_table_kref); +} + +/** + * em_table_alloc() - Allocate a new EM table + * @pd : EM performance domain for which this must be done + * + * Allocate a new EM table and initialize its kref to indicate that it + * has a user. + * Returns allocated table or NULL. + */ +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) +{ + struct em_perf_table *table; + int table_size; + + table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + + table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL); + if (!table) + return NULL; + + kref_init(&table->kref); + + return table; +} + +static void em_init_performance(struct device *dev, struct em_perf_domain *pd, + struct em_perf_state *table, int nr_states) +{ + u64 fmax, max_cap; + int i, cpu; + + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return; + + cpu = cpumask_first(em_span_cpus(pd)); + + /* + * Calculate the performance value for each frequency with + * linear relationship. The final CPU capacity might not be ready at + * boot time, but the EM will be updated a bit later with correct one. + */ + fmax = (u64) table[nr_states - 1].frequency; + max_cap = (u64) arch_scale_cpu_capacity(cpu); + for (i = 0; i < nr_states; i++) + table[i].performance = div64_u64(max_cap * table[i].frequency, + fmax); +} + +static int em_compute_costs(struct device *dev, struct em_perf_state *table, + const struct em_data_callback *cb, int nr_states, + unsigned long flags) +{ + unsigned long prev_cost = ULONG_MAX; + int i, ret; + + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + + /* Compute the cost of each performance state. */ + for (i = nr_states - 1; i >= 0; i--) { + unsigned long power_res, cost; + + if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + return -EINVAL; + } + } else { + /* increase resolution of 'cost' precision */ + power_res = table[i].power * 10; + cost = power_res / table[i].performance; + } + + table[i].cost = cost; + + if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } + } + + return 0; +} + +/** + * em_dev_compute_costs() - Calculate cost values for new runtime EM table + * @dev : Device for which the EM table is to be updated + * @table : The new EM table that is going to get the costs calculated + * @nr_states : Number of performance states + * + * Calculate the em_perf_state::cost values for new runtime EM table. The + * values are used for EAS during task placement. It also calculates and sets + * the efficiency flag for each performance state. When the function finish + * successfully the EM table is ready to be updated and used by EAS. + * + * Return 0 on success or a proper error in case of failure. + */ +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states) +{ + return em_compute_costs(dev, table, NULL, nr_states, 0); +} + +/** + * em_dev_update_perf_domain() - Update runtime EM table for a device + * @dev : Device for which the EM is to be updated + * @new_table : The new EM table that is going to be used from now + * + * Update EM runtime modifiable table for the @dev using the provided @table. + * + * This function uses a mutex to serialize writers, so it must not be called + * from a non-sleeping context. + * + * Return 0 on success or an error code on failure. + */ +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table *new_table) +{ + struct em_perf_table *old_table; + struct em_perf_domain *pd; + + if (!dev) + return -EINVAL; + + /* Serialize update/unregister or concurrent updates */ + mutex_lock(&em_pd_mutex); + + if (!dev->em_pd) { + mutex_unlock(&em_pd_mutex); + return -EINVAL; + } + pd = dev->em_pd; + + kref_get(&new_table->kref); + + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); + rcu_assign_pointer(pd->em_table, new_table); + + em_cpufreq_update_efficiencies(dev, new_table->state); + + em_table_free(old_table); + + mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); + return 0; +} +EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + struct em_perf_state *table, + const struct em_data_callback *cb, + unsigned long flags) +{ + unsigned long power, freq, prev_freq = 0; + int nr_states = pd->nr_perf_states; + int i, ret; + + /* Build the list of performance states for this performance domain */ + for (i = 0, freq = 0; i < nr_states; i++, freq++) { + /* + * active_power() is a driver callback which ceils 'freq' to + * lowest performance state of 'dev' above 'freq' and updates + * 'power' and 'freq' accordingly. + */ + ret = cb->active_power(dev, &power, &freq); + if (ret) { + dev_err(dev, "EM: invalid perf. state: %d\n", + ret); + return -EINVAL; + } + + /* + * We expect the driver callback to increase the frequency for + * higher performance states. + */ + if (freq <= prev_freq) { + dev_err(dev, "EM: non-increasing freq: %lu\n", + freq); + return -EINVAL; + } + + /* + * The power returned by active_state() is expected to be + * positive and be in range. + */ + if (!power || power > EM_MAX_POWER) { + dev_err(dev, "EM: invalid power: %lu\n", + power); + return -EINVAL; + } + + table[i].power = power; + table[i].frequency = prev_freq = freq; + } + + em_init_performance(dev, pd, table, nr_states); + + ret = em_compute_costs(dev, table, cb, nr_states, flags); + if (ret) + return -EINVAL; + + return 0; +} + +static int em_create_pd(struct device *dev, int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, + unsigned long flags) +{ + struct em_perf_table *em_table; + struct em_perf_domain *pd; + struct device *cpu_dev; + int cpu, ret, num_cpus, id; + + if (_is_cpu_device(dev)) { + num_cpus = cpumask_weight(cpus); + + /* Prevent max possible energy calculation to not overflow */ + if (num_cpus > EM_MAX_NUM_CPUS) { + dev_err(dev, "EM: too many CPUs, overflow possible\n"); + return -EINVAL; + } + + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return -ENOMEM; + + cpumask_copy(em_span_cpus(pd), cpus); + } else { + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return -ENOMEM; + } + + pd->nr_perf_states = nr_states; + + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + + em_table = em_table_alloc(pd); + if (!em_table) + goto free_pd; + + ret = em_create_perf_table(dev, pd, em_table->state, cb, flags); + if (ret) + goto free_pd_table; + + rcu_assign_pointer(pd->em_table, em_table); + + if (_is_cpu_device(dev)) + for_each_cpu(cpu, cpus) { + cpu_dev = get_cpu_device(cpu); + cpu_dev->em_pd = pd; + } + + dev->em_pd = pd; + + return 0; + +free_pd_table: + kfree(em_table); +free_pd: + kfree(pd); + ida_free(&em_pd_ida, id); + return -EINVAL; +} + +static void +em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table) +{ + struct em_perf_domain *pd = dev->em_pd; + struct cpufreq_policy *policy; + int found = 0; + int i, cpu; + + if (!_is_cpu_device(dev)) + return; + + /* Try to get a CPU which is active and in this PD */ + cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask); + if (cpu >= nr_cpu_ids) { + dev_warn(dev, "EM: No online CPU for CPUFreq policy\n"); + return; + } + + policy = cpufreq_cpu_get(cpu); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed\n"); + return; + } + + for (i = 0; i < pd->nr_perf_states; i++) { + if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) + continue; + + if (!cpufreq_table_set_inefficient(policy, table[i].frequency)) + found++; + } + + cpufreq_cpu_put(policy); + + if (!found) + return; + + /* + * Efficiencies have been installed in CPUFreq, inefficient frequencies + * will be skipped. The EM can do the same. + */ + pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES; +} + +/** + * em_pd_get() - Return the performance domain for a device + * @dev : Device to find the performance domain for + * + * Returns the performance domain to which @dev belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_pd_get(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev)) + return NULL; + + return dev->em_pd; +} +EXPORT_SYMBOL_GPL(em_pd_get); + +/** + * em_cpu_get() - Return the performance domain for a CPU + * @cpu : CPU to find the performance domain for + * + * Returns the performance domain to which @cpu belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_cpu_get(int cpu) +{ + struct device *cpu_dev; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return NULL; + + return em_pd_get(cpu_dev); +} +EXPORT_SYMBOL_GPL(em_cpu_get); + +/** + * em_dev_register_perf_domain() - Register the Energy Model (EM) for a device + * @dev : Device for which the EM is to register + * @nr_states : Number of performance states to register + * @cb : Callback functions providing the data of the Energy Model + * @cpus : Pointer to cpumask_t, which in case of a CPU device is + * obligatory. It can be taken from i.e. 'policy->cpus'. For other + * type of devices this should be set to NULL. + * @microwatts : Flag indicating that the power values are in micro-Watts or + * in some other scale. It must be set properly. + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * The @microwatts is important to set with correct value. Some kernel + * sub-systems might rely on this flag and check if all devices in the EM are + * using the same scale. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + struct em_perf_table *em_table; + unsigned long cap, prev_cap = 0; + unsigned long flags = 0; + int cpu, ret; + + if (!dev || !nr_states || !cb) + return -EINVAL; + + /* + * Use a mutex to serialize the registration of performance domains and + * let the driver-defined callback functions sleep. + */ + mutex_lock(&em_pd_mutex); + + if (dev->em_pd) { + ret = -EEXIST; + goto unlock; + } + + if (_is_cpu_device(dev)) { + if (!cpus) { + dev_err(dev, "EM: invalid CPU mask\n"); + ret = -EINVAL; + goto unlock; + } + + for_each_cpu(cpu, cpus) { + if (em_cpu_get(cpu)) { + dev_err(dev, "EM: exists for CPU%d\n", cpu); + ret = -EEXIST; + goto unlock; + } + /* + * All CPUs of a domain must have the same + * micro-architecture since they all share the same + * table. + */ + cap = arch_scale_cpu_capacity(cpu); + if (prev_cap && prev_cap != cap) { + dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(cpus)); + + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } + } + + if (microwatts) + flags |= EM_PERF_DOMAIN_MICROWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + /* + * EM only supports uW (exception is artificial EM). + * Therefore, check and force the drivers to provide + * power in uW. + */ + if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) { + dev_err(dev, "EM: only supports uW power values\n"); + ret = -EINVAL; + goto unlock; + } + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); + if (ret) + goto unlock; + + dev->em_pd->flags |= flags; + dev->em_pd->min_perf_state = 0; + dev->em_pd->max_perf_state = nr_states - 1; + + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); + + em_debug_create_pd(dev); + dev_info(dev, "EM: created perf domain\n"); + +unlock: + mutex_unlock(&em_pd_mutex); + if (ret) + return ret; + + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_created(dev->em_pd); + + return 0; +} +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); + +/** + * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device + * @dev : Device for which the EM is registered + * + * Unregister the EM for the specified @dev (but not a CPU device). + */ +void em_dev_unregister_perf_domain(struct device *dev) +{ + if (IS_ERR_OR_NULL(dev) || !dev->em_pd) + return; + + if (_is_cpu_device(dev)) + return; + + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + + /* + * The mutex separates all register/unregister requests and protects + * from potential clean-up/setup issues in the debugfs directories. + * The debugfs directory name is the same as device's name. + */ + mutex_lock(&em_pd_mutex); + em_debug_remove_pd(dev); + + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); + + ida_free(&em_pd_ida, dev->em_pd->id); + + kfree(dev->em_pd); + dev->em_pd = NULL; + mutex_unlock(&em_pd_mutex); +} +EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); + +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) +{ + struct em_perf_table *em_table; + struct em_perf_state *ps, *new_ps; + int ps_size; + + em_table = em_table_alloc(pd); + if (!em_table) + return NULL; + + new_ps = em_table->state; + + rcu_read_lock(); + ps = em_perf_state_from_pd(pd); + /* Initialize data based on old table */ + ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + memcpy(new_ps, ps, ps_size); + + rcu_read_unlock(); + + return em_table; +} + +static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, + struct em_perf_table *em_table) +{ + int ret; + + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } + + ret = em_dev_update_perf_domain(dev, em_table); + if (ret) + goto free_em_table; + + /* + * This is one-time-update, so give up the ownership in this updater. + * The EM framework has incremented the usage counter and from now + * will keep the reference (then free the memory when needed). + */ +free_em_table: + em_table_free(em_table); + return ret; +} + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, + struct em_perf_domain *pd) +{ + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); + struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + em_init_performance(dev, pd, em_table->state, pd->nr_perf_states); + + em_recalc_and_update(dev, pd, em_table); +} + +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. + */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + +static void em_check_capacity_update(void) +{ + cpumask_var_t cpu_done_mask; + int cpu, failed_cpus = 0; + + if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { + pr_warn("no free memory\n"); + return; + } + + /* Check if CPUs capacity has changed than update EM */ + for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy; + struct em_perf_domain *pd; + struct device *dev; + + if (cpumask_test_cpu(cpu, cpu_done_mask)) + continue; + + policy = cpufreq_cpu_get(cpu); + if (!policy) { + failed_cpus++; + continue; + } + cpufreq_cpu_put(policy); + + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); + if (!pd || em_is_artificial(pd)) + continue; + + cpumask_or(cpu_done_mask, cpu_done_mask, + em_span_cpus(pd)); + + em_adjust_new_capacity(cpu, dev, pd); + } + + if (failed_cpus) + schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000)); + + free_cpumask_var(cpu_done_mask); +} + +static void em_update_workfn(struct work_struct *work) +{ + em_check_capacity_update(); +} + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. + */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); + + +/** + * em_update_performance_limits() - Update Energy Model with performance + * limits information. + * @pd : Performance Domain with EM that has to be updated. + * @freq_min_khz : New minimum allowed frequency for this device. + * @freq_max_khz : New maximum allowed frequency for this device. + * + * This function allows to update the EM with information about available + * performance levels. It takes the minimum and maximum frequency in kHz + * and does internal translation to performance levels. + * Returns 0 on success or -EINVAL when failed. + */ +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + struct em_perf_state *table; + int min_ps = -1; + int max_ps = -1; + int i; + + if (!pd) + return -EINVAL; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + if (freq_min_khz == table[i].frequency) + min_ps = i; + if (freq_max_khz == table[i].frequency) + max_ps = i; + } + rcu_read_unlock(); + + /* Only update when both are found and sane */ + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps) + return -EINVAL; + + + /* Guard simultaneous updates and make them atomic */ + mutex_lock(&em_pd_mutex); + pd->min_perf_state = min_ps; + pd->max_perf_state = max_ps; + mutex_unlock(&em_pd_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(em_update_performance_limits); + +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +void em_rebuild_sched_domains(void) +{ + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * @@ -6,13 +7,14 @@ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> - * - * This file is released under the GPLv2. */ +#define pr_fmt(fmt) "PM: hibernation: " fmt + +#include <crypto/acompress.h> +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -21,25 +23,39 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/pm.h> +#include <linux/nmi.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> -#include <linux/genhd.h> +#include <linux/ktime.h> +#include <linux/security.h> +#include <linux/secretmem.h> +#include <trace/events/power.h> #include "power.h" static int nocompress; static int noresume; +static int nohibernate; static int resume_wait; -static int resume_delay; +static unsigned int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata; +__visible int in_suspend __nosavedata; + +static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP; + +/* + * Compression/decompression algorithm to be used while saving/loading + * image to/from disk. This would later be used in 'kernel/power/swap.c' + * to allocate comp streams. + */ +char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; enum { HIBERNATION_INVALID, @@ -49,6 +65,7 @@ enum { #ifdef CONFIG_SUSPEND HIBERNATION_SUSPEND, #endif + HIBERNATION_TEST_RESUME, /* keep last */ __HIBERNATION_AFTER_LAST }; @@ -61,27 +78,67 @@ bool freezer_test_done; static const struct platform_hibernation_ops *hibernation_ops; +static atomic_t hibernate_atomic = ATOMIC_INIT(1); + +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + +bool hibernate_acquire(void) +{ + return atomic_add_unless(&hibernate_atomic, -1, 0); +} + +void hibernate_release(void) +{ + atomic_inc(&hibernate_atomic); +} + +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + +bool hibernation_available(void) +{ + return nohibernate == 0 && + !security_locked_down(LOCKDOWN_HIBERNATION) && + !secretmem_active() && !cxl_mem_active(); +} + /** * hibernation_set_ops - Set the global hibernate operations. * @ops: Hibernation operations to use in subsequent hibernation transitions. */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { + unsigned int sleep_flags; + if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } - lock_system_sleep(); + + sleep_flags = lock_system_sleep(); + hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } +EXPORT_SYMBOL_GPL(hibernation_set_ops); static bool entering_platform_hibernation; @@ -92,10 +149,15 @@ bool system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) @@ -117,7 +179,7 @@ static int hibernation_test(int level) { return 0; } static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; + hibernation_ops->begin(PMSG_FREEZE) : 0; } /** @@ -224,25 +286,30 @@ static void platform_recover(int platform_mode) * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. */ -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) +void swsusp_show_speed(ktime_t start, ktime_t stop, + unsigned nr_pages, char *msg) { - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + ktime_t diff; + u64 elapsed_centisecs64; + unsigned int centisecs; + unsigned int k; + unsigned int kps; + + diff = ktime_sub(stop, start); + elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ k = nr_pages * (PAGE_SIZE / 1024); kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); + pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", + msg, k, centisecs / 100, centisecs % 100, kps / 1000, + (kps % 1000) / 10); +} + +__weak int arch_resume_nosmt(void) +{ + return 0; } /** @@ -260,8 +327,7 @@ static int create_image(int platform_mode) error = dpm_suspend_end(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some devices failed to power down, aborting\n"); return error; } @@ -269,16 +335,17 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = disable_nonboot_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { - printk(KERN_ERR "PM: Some system devices failed to power down, " - "aborting hibernation\n"); + pr_err("Some system devices failed to power down, aborting\n"); goto Enable_irqs; } @@ -287,25 +354,34 @@ static int create_image(int platform_mode) in_suspend = 1; save_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); - if (error) - printk(KERN_ERR "PM: Error %d creating hibernation image\n", - error); /* Restore control flow magically appears here */ restore_processor_state(); + trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); + if (error) + pr_err("Error %d creating image\n", error); + if (!in_suspend) { events_check_enabled = false; - platform_leave(platform_mode); + clear_or_poison_free_pages(); } + platform_leave(platform_mode); + Power_up: syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + pm_sleep_enable_secondary_cpus(); + + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); Platform_finish: platform_finish(platform_mode); @@ -316,17 +392,35 @@ static int create_image(int platform_mode) return error; } +static void shrink_shmem_memory(void) +{ + struct sysinfo info; + unsigned long nr_shmem_pages, nr_freed_pages; + + si_meminfo(&info); + nr_shmem_pages = info.sharedram; /* current page count used for shmem */ + /* + * The intent is to reclaim all shmem pages. Though shrink_all_memory() can + * only reclaim about half of them, it's enough for creating the hibernation + * image. + */ + nr_freed_pages = shrink_all_memory(nr_shmem_pages); + pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", + nr_shmem_pages, nr_freed_pages); +} + /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. + * This routine must be called with system_transition_mutex held. */ int hibernation_snapshot(int platform_mode) { pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; @@ -356,8 +450,16 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); - ftrace_stop(); + /* + * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem + * pages will use lots of system memory, causing hibernation image creation + * fail due to insufficient free memory. + * This call is to force flush the shmem pages to swap disk and reclaim + * the system memory so that image creation can succeed. + */ + shrink_shmem_memory(); + + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -383,8 +485,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - ftrace_start(); - resume_console(); + console_resume_all(); dpm_complete(msg); Close: @@ -398,6 +499,11 @@ int hibernation_snapshot(int platform_mode) goto Close; } +int __weak hibernate_resume_nonboot_cpu_disable(void) +{ + return suspend_disable_secondary_cpus(); +} + /** * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -413,8 +519,7 @@ static int resume_target_kernel(bool platform_mode) error = dpm_suspend_end(PMSG_QUIESCE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " - "aborting resume\n"); + pr_err("Some devices failed to power down, aborting resume\n"); return error; } @@ -422,11 +527,14 @@ static int resume_target_kernel(bool platform_mode) if (error) goto Cleanup; - error = disable_nonboot_cpus(); + cpuidle_pause(); + + error = hibernate_resume_nonboot_cpu_disable(); if (error) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -460,10 +568,11 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + pm_sleep_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -477,25 +586,28 @@ static int resume_target_kernel(bool platform_mode) * hibernation_restore - Quiesce devices and restore from a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. * - * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snapshot(). + * This routine must be called with system_transition_mutex held. If it is + * successful, control reappears in the restored target kernel in + * hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { int error; pm_prepare_console(); - suspend_console(); - ftrace_stop(); - pm_restrict_gfp_mask(); + console_suspend_all(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } - pm_restore_gfp_mask(); - ftrace_start(); - resume_console(); + dpm_resume_end(PMSG_RECOVER); + console_resume_all(); pm_restore_console(); return error; } @@ -515,13 +627,12 @@ int hibernation_platform_enter(void) * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ - error = hibernation_ops->begin(); + error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; entering_platform_hibernation = true; - suspend_console(); - ftrace_stop(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -537,12 +648,17 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = disable_nonboot_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error) - goto Platform_finish; + goto Enable_cpus; local_irq_disable(); - syscore_suspend(); + system_state = SYSTEM_SUSPEND; + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -554,8 +670,12 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); - enable_nonboot_cpus(); + + Enable_cpus: + pm_sleep_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); @@ -565,8 +685,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - ftrace_start(); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -583,8 +702,17 @@ int hibernation_platform_enter(void) */ static void power_down(void) { -#ifdef CONFIG_SUSPEND int error; + +#ifdef CONFIG_SUSPEND + if (hibernation_mode == HIBERNATION_SUSPEND) { + error = suspend_devices_and_enter(mem_sleep_current); + if (!error) + goto exit; + + hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; + } #endif switch (hibernation_mode) { @@ -592,260 +720,435 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - hibernation_platform_enter(); + error = hibernation_platform_enter(); + if (error == -EAGAIN || error == -EBUSY) { + events_check_enabled = false; + pr_info("Wakeup event detected during hibernation, rolling back.\n"); + goto exit; + } + fallthrough; case HIBERNATION_SHUTDOWN: - kernel_power_off(); - break; -#ifdef CONFIG_SUSPEND - case HIBERNATION_SUSPEND: - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - if (error) { - if (hibernation_ops) - hibernation_mode = HIBERNATION_PLATFORM; - else - hibernation_mode = HIBERNATION_SHUTDOWN; - power_down(); + if (kernel_can_power_off()) { + entering_platform_hibernation = true; + kernel_power_off(); + entering_platform_hibernation = false; } - /* - * Restore swap signature. - */ - error = swsusp_unmark(); - if (error) - printk(KERN_ERR "PM: Swap will be unusable! " - "Try swapon -a.\n"); - return; -#endif + break; } kernel_halt(); /* * Valid image is on the disk, if we continue we risk serious data * corruption after resume. */ - printk(KERN_CRIT "PM: Please power down manually\n"); - while(1); + pr_crit("Power down manually\n"); + while (1) + cpu_relax(); + +exit: + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); } +static int load_image_and_restore(void) +{ + int error; + unsigned int flags; + + pm_pr_dbg("Loading hibernation image.\n"); + + lock_device_hotplug(); + error = create_basic_memory_bitmaps(); + if (error) { + swsusp_close(); + goto Unlock; + } + + error = swsusp_read(&flags); + swsusp_close(); + if (!error) + error = hibernation_restore(flags & SF_PLATFORM_MODE); + + pr_err("Failed to load image, recovering.\n"); + swsusp_free(); + free_basic_memory_bitmaps(); + Unlock: + unlock_device_hotplug(); + + return error; +} + +#define COMPRESSION_ALGO_LZO "lzo" +#define COMPRESSION_ALGO_LZ4 "lz4" + /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { + bool snapshot_test = false; + unsigned int sleep_flags; int error; - lock_system_sleep(); + if (!hibernation_available()) { + pm_pr_dbg("Hibernation not available.\n"); + return -EPERM; + } + + /* + * Query for the compression algorithm support if compression is enabled. + */ + if (!nocompress) { + strscpy(hib_comp_algo, hibernate_compressor); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { + pr_err("%s compression is not available\n", hib_comp_algo); + return -EOPNOTSUPP; + } + } + + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } + pr_info("hibernation entry\n"); pm_prepare_console(); - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); if (error) - goto Exit; + goto Restore; - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); + error = pm_sleep_fs_sync(); if (error) - goto Exit; + goto Notify; - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) - goto Free_bitmaps; + goto Exit; + + lock_device_hotplug(); + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) - goto Thaw; + goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; - if (nocompress) + if (nocompress) { flags |= SF_NOCOMPRESS_MODE; - else + } else { flags |= SF_CRC32_MODE; - pr_debug("PM: writing image.\n"); + /* + * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4 + * to override this behaviour and use LZ4. + * + * Refer kernel/power/power.h for more details + */ + + if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4)) + flags |= SF_COMPRESSION_ALG_LZ4; + else + flags |= SF_COMPRESSION_ALG_LZO; + } + + pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); - if (!error) - power_down(); + if (!error) { + if (hibernation_mode == HIBERNATION_TEST_RESUME) + snapshot_test = true; + else + power_down(); + } in_suspend = 0; pm_restore_gfp_mask(); } else { - pr_debug("PM: Image restored successfully.\n"); + pm_pr_dbg("Hibernation image restored successfully.\n"); } + Free_bitmaps: + free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); + if (snapshot_test) { + pm_pr_dbg("Checking hibernation image\n"); + error = swsusp_check(false); + if (!error) + error = load_image_and_restore(); + } thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - - Free_bitmaps: - free_basic_memory_bitmaps(); Exit: + filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); + Restore: pm_restore_console(); - atomic_inc(&snapshot_device_available); + hibernate_release(); Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); + pr_info("hibernation exit\n"); + return error; } - /** - * software_resume - Resume from a saved hibernation image. + * hibernate_quiet_exec - Execute a function with all devices frozen. + * @func: Function to execute. + * @data: Data pointer to pass to @func. * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. + * Return the @func return value or an error code if it cannot be executed. */ -static int software_resume(void) +int hibernate_quiet_exec(int (*func)(void *data), void *data) { + unsigned int sleep_flags; int error; - unsigned int flags; - /* - * If the user said "noresume".. bail out early. - */ - if (noresume) - return 0; + sleep_flags = lock_system_sleep(); - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take pm_mutex) this can - * cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + if (!hibernate_acquire()) { + error = -EBUSY; + goto unlock; + } - if (swsusp_resume_device) - goto Check_image; + pm_prepare_console(); - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); + if (error) + goto restore; + + filesystems_freeze(filesystem_freeze_enabled); + + error = freeze_processes(); + if (error) + goto exit; + + lock_device_hotplug(); + + pm_suspend_clear_flags(); + + error = platform_begin(true); + if (error) + goto thaw; + + error = freeze_kernel_threads(); + if (error) + goto thaw; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto dpm_complete; + + console_suspend_all(); + + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = dpm_suspend_end(PMSG_FREEZE); + if (error) + goto dpm_resume; + + error = platform_pre_snapshot(true); + if (error) + goto skip; + + error = func(data); + +skip: + platform_finish(true); + + dpm_resume_start(PMSG_THAW); + +dpm_resume: + dpm_resume(PMSG_THAW); + + console_resume_all(); - pr_debug("PM: Checking hibernation image partition %s\n", resume_file); +dpm_complete: + dpm_complete(PMSG_THAW); + + thaw_kernel_threads(); + +thaw: + platform_end(true); + + unlock_device_hotplug(); + + thaw_processes(); + +exit: + filesystems_thaw(); + pm_notifier_call_chain(PM_POST_HIBERNATION); + +restore: + pm_restore_console(); + + hibernate_release(); + +unlock: + unlock_system_sleep(sleep_flags); + + return error; +} +EXPORT_SYMBOL_GPL(hibernate_quiet_exec); + +static int __init find_resume_device(void) +{ + if (!strlen(resume_file)) + return -ENOENT; + + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { - printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + pr_info("Waiting %dsec before reading resume device ...\n", resume_delay); ssleep(resume_delay); } /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) + return 0; /* - * name_to_dev_t is ineffective to verify parition if resume_file is in - * integer format. (e.g. major:minor) + * Some device discovery might still be in progress; we need to wait for + * this to finish. */ - if (isdigit(resume_file[0]) && resume_wait) { - int partno; - while (!get_gendisk(swsusp_resume_device, &partno)) + wait_for_device_probe(); + if (resume_wait) { + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) msleep(10); + async_synchronize_full(); } - if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } + return early_lookup_bdev(resume_file, &swsusp_resume_device); +} - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; - goto Unlock; - } - } +static int software_resume(void) +{ + int error; - Check_image: - pr_debug("PM: Hibernation image partition %d:%d present\n", + pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Looking for hibernation image.\n"); - error = swsusp_check(); + pm_pr_dbg("Looking for hibernation image.\n"); + + mutex_lock(&system_transition_mutex); + error = swsusp_check(true); if (error) goto Unlock; + /* + * Check if the hibernation image is compressed. If so, query for + * the algorithm support. + */ + if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { + if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); + else + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { + pr_err("%s compression is not available\n", hib_comp_algo); + error = -EOPNOTSUPP; + goto Unlock; + } + } + /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ); + swsusp_close(); goto Unlock; } + pr_info("resume from hibernation\n"); pm_prepare_console(); - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (error) - goto close_finish; + goto Restore; - error = create_basic_memory_bitmaps(); - if (error) - goto close_finish; + filesystems_freeze(filesystem_freeze_enabled); - pr_debug("PM: Preparing processes for restore.\n"); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); if (error) { - swsusp_close(FMODE_READ); - goto Done; + filesystems_thaw(); + goto Close_Finish; } - pr_debug("PM: Loading hibernation image.\n"); - - error = swsusp_read(&flags); - swsusp_close(FMODE_READ); - if (!error) - hibernation_restore(flags & SF_PLATFORM_MODE); + error = freeze_kernel_threads(); + if (error) { + thaw_processes(); + filesystems_thaw(); + goto Close_Finish; + } - printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); - swsusp_free(); + error = load_image_and_restore(); thaw_processes(); - Done: - free_basic_memory_bitmaps(); + filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); + Restore: pm_restore_console(); - atomic_inc(&snapshot_device_available); + pr_info("resume failed (%d)\n", error); + hibernate_release(); /* For success case, the suspend path will release the lock */ Unlock: - mutex_unlock(&pm_mutex); - pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + mutex_unlock(&system_transition_mutex); + pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; -close_finish: - swsusp_close(FMODE_READ); + Close_Finish: + swsusp_close(); goto Finish; } -late_initcall(software_resume); +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { @@ -855,6 +1158,7 @@ static const char * const hibernation_modes[] = { #ifdef CONFIG_SUSPEND [HIBERNATION_SUSPEND] = "suspend", #endif + [HIBERNATION_TEST_RESUME] = "test_resume", }; /* @@ -886,8 +1190,11 @@ static const char * const hibernation_modes[] = { static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + ssize_t count = 0; int i; - char *start = buf; + + if (!hibernation_available()) + return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) @@ -898,6 +1205,7 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: break; case HIBERNATION_PLATFORM: if (hibernation_ops) @@ -906,27 +1214,35 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, continue; } if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } - buf += sprintf(buf, "\n"); - return buf-start; + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; int error = 0; - int i; int len; char *p; - int mode = HIBERNATION_INVALID; + int i; + + if (!hibernation_available()) + return -EPERM; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -941,6 +1257,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND case HIBERNATION_SUSPEND: #endif + case HIBERNATION_TEST_RESUME: hibernation_mode = mode; break; case HIBERNATION_PLATFORM: @@ -953,9 +1270,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, error = -EINVAL; if (!error) - pr_debug("PM: Hibernation mode set to '%s'\n", - hibernation_modes[mode]); - unlock_system_sleep(); + pm_pr_dbg("Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + unlock_system_sleep(sleep_flags); return error ? error : n; } @@ -964,41 +1281,90 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); + return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned int maj, min; - dev_t res; - int ret = -EINVAL; + unsigned int sleep_flags; + int len = n; + char *name; + dev_t dev; + int error; + + if (!hibernation_available()) + return n; - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + error = 0; + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } + kfree(name); + if (error) + return error; - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; + sleep_flags = lock_system_sleep(); + swsusp_resume_device = dev; + unlock_system_sleep(sleep_flags); - lock_system_sleep(); - swsusp_resume_device = res; - unlock_system_sleep(); - printk(KERN_INFO "PM: Starting manual resume from disk\n"); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", + swsusp_resume_device); noresume = 0; software_resume(); - ret = n; - out: - return ret; + return n; } power_attr(resume); +static ssize_t resume_offset_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); +} + +static ssize_t resume_offset_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t n) +{ + unsigned long long offset; + int rc; + + rc = kstrtoull(buf, 0, &offset); + if (rc) + return rc; + swsusp_resume_block = offset; + + return n; +} + +power_attr(resume_offset); + static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", image_size); + return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1019,7 +1385,7 @@ power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", reserved_size); + return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, @@ -1038,8 +1404,9 @@ static ssize_t reserved_size_store(struct kobject *kobj, power_attr(reserved_size); -static struct attribute * g[] = { +static struct attribute *g[] = { &disk_attr.attr, + &resume_offset_attr.attr, &resume_attr.attr, &image_size_attr.attr, &reserved_size_attr.attr, @@ -1047,7 +1414,7 @@ static struct attribute * g[] = { }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; @@ -1065,7 +1432,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy( resume_file, str, 255 ); + strscpy(resume_file, str); return 1; } @@ -1084,10 +1451,17 @@ static int __init resume_offset_setup(char *str) static int __init hibernate_setup(char *str) { - if (!strncmp(str, "noresume", 8)) + if (!strncmp(str, "noresume", 8)) { noresume = 1; - else if (!strncmp(str, "nocompress", 10)) + } else if (!strncmp(str, "nocompress", 10)) { nocompress = 1; + } else if (!strncmp(str, "no", 2)) { + noresume = 1; + nohibernate = 1; + } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) + && !strncmp(str, "protect_image", 13)) { + enable_restore_image_protection(); + } return 1; } @@ -1105,13 +1479,74 @@ static int __init resumewait_setup(char *str) static int __init resumedelay_setup(char *str) { - resume_delay = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &resume_delay); + + if (rc) + pr_warn("resumedelay: bad option string '%s'\n", str); + return 1; +} + +static int __init nohibernate_setup(char *str) +{ + noresume = 1; + nohibernate = 1; return 1; } +static const char * const comp_alg_enabled[] = { +#if IS_ENABLED(CONFIG_CRYPTO_LZO) + COMPRESSION_ALGO_LZO, +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) + COMPRESSION_ALGO_LZ4, +#endif +}; + +static int hibernate_compressor_param_set(const char *compressor, + const struct kernel_param *kp) +{ + int index, ret; + + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; + + index = sysfs_match_string(comp_alg_enabled, compressor); + if (index >= 0) { + ret = param_set_copystring(comp_alg_enabled[index], kp); + if (!ret) + strscpy(hib_comp_algo, comp_alg_enabled[index]); + } else { + ret = index; + } + + mutex_unlock(&system_transition_mutex); + + if (ret) + pr_debug("Cannot set specified compressor %s\n", + compressor); + + return ret; +} + +static const struct kernel_param_ops hibernate_compressor_param_ops = { + .set = hibernate_compressor_param_set, + .get = param_get_string, +}; + +static struct kparam_string hibernate_compressor_param_string = { + .maxlen = sizeof(hibernate_compressor), + .string = hibernate_compressor, +}; + +module_param_cb(compressor, &hibernate_compressor_param_ops, + &hibernate_compressor_param_string, 0644); +MODULE_PARM_DESC(compressor, + "Compression algorithm to be used with hibernation"); + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); +__setup("nohibernate", nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 1d1bf630e6e9..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1,26 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * */ +#include <linux/acpi.h> #include <linux/export.h> +#include <linux/init.h> #include <linux/kobject.h> #include <linux/string.h> -#include <linux/resume-trace.h> +#include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" -DEFINE_MUTEX(pm_mutex); - #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). + */ +static unsigned int saved_gfp_count; +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + + if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + return; + + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + + pm_pr_dbg("GFP mask restored\n"); +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + + if (saved_gfp_count++) { + WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask); + return; + } + + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); + + pm_pr_dbg("GFP mask restricted\n"); +} + +unsigned int lock_system_sleep(void) +{ + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; + mutex_lock(&system_transition_mutex); + return flags; +} +EXPORT_SYMBOL_GPL(lock_system_sleep); + +void unlock_system_sleep(unsigned int flags) +{ + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; + mutex_unlock(&system_transition_mutex); +} +EXPORT_SYMBOL_GPL(unlock_system_sleep); + +void ksys_sync_helper(void) +{ + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); + ksys_sync(); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. + */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ /* Routines for PM-transition notifications */ @@ -38,20 +165,35 @@ int unregister_pm_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_pm_notifier); -int pm_notifier_call_chain(unsigned long val) +int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { - int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); + int ret; + + ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } +int pm_notifier_call_chain(unsigned long val) +{ + return blocking_notifier_call_chain(&pm_chain_head, val, NULL); +} + /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_async_enabled); + return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -71,7 +213,114 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); -#ifdef CONFIG_PM_DEBUG +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + ssize_t count = 0; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + count += sysfs_emit_at(buf, count, "[%s] ", label); + else + count += sysfs_emit_at(buf, count, "%s ", label); + } + } + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); + +/* + * sync_on_suspend: Sync file systems before suspend. + * + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. + */ +bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); + +static ssize_t sync_on_suspend_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); +} + +static ssize_t sync_on_suspend_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + sync_on_suspend_enabled = !!val; + return n; +} + +power_attr(sync_on_suspend); +#endif /* CONFIG_SUSPEND */ + +#ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { @@ -86,37 +335,38 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = { static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) - s += sprintf(s, "[%s] ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else - s += sprintf(s, "%s ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + unsigned int sleep_flags; const char * const *s; + int error = -EINVAL; int level; char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -126,38 +376,213 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); -#endif /* CONFIG_PM_DEBUG */ +#endif /* CONFIG_PM_SLEEP_DEBUG */ -#ifdef CONFIG_DEBUG_FS -static char *suspend_step_name(enum suspend_stat_step step) -{ - switch (step) { - case SUSPEND_FREEZE: - return "freeze"; - case SUSPEND_PREPARE: - return "prepare"; - case SUSPEND_SUSPEND: - return "suspend"; - case SUSPEND_SUSPEND_NOIRQ: - return "suspend_noirq"; - case SUSPEND_RESUME_NOIRQ: - return "resume_noirq"; - case SUSPEND_RESUME: - return "resume"; - default: - return ""; +#define SUSPEND_NR_STEPS SUSPEND_RESUME +#define REC_FAILED_NUM 2 + +struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; + unsigned int success; + unsigned int fail; + int last_failed_dev; + char failed_devs[REC_FAILED_NUM][40]; + int last_failed_errno; + int errno[REC_FAILED_NUM]; + int last_failed_step; + u64 last_hw_sleep; + u64 total_hw_sleep; + u64 max_hw_sleep; + enum suspend_stat_step failed_steps[REC_FAILED_NUM]; +}; + +static struct suspend_stats suspend_stats; +static DEFINE_MUTEX(suspend_stats_lock); + +void dpm_save_failed_dev(const char *name) +{ + mutex_lock(&suspend_stats_lock); + + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + name, sizeof(suspend_stats.failed_devs[0])); + suspend_stats.last_failed_dev++; + suspend_stats.last_failed_dev %= REC_FAILED_NUM; + + mutex_unlock(&suspend_stats_lock); +} + +void dpm_save_failed_step(enum suspend_stat_step step) +{ + suspend_stats.step_failures[step-1]++; + suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; + suspend_stats.last_failed_step++; + suspend_stats.last_failed_step %= REC_FAILED_NUM; +} + +void dpm_save_errno(int err) +{ + if (!err) { + suspend_stats.success++; + return; } + + suspend_stats.fail++; + + suspend_stats.errno[suspend_stats.last_failed_errno] = err; + suspend_stats.last_failed_errno++; + suspend_stats.last_failed_errno %= REC_FAILED_NUM; +} + +void pm_report_hw_sleep_time(u64 t) +{ + suspend_stats.last_hw_sleep = t; + suspend_stats.total_hw_sleep += t; +} +EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); + +void pm_report_max_hw_sleep(u64 t) +{ + suspend_stats.max_hw_sleep = t; +} +EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); + +static const char * const suspend_step_names[] = { + [SUSPEND_WORKING] = "", + [SUSPEND_FREEZE] = "freeze", + [SUSPEND_PREPARE] = "prepare", + [SUSPEND_SUSPEND] = "suspend", + [SUSPEND_SUSPEND_LATE] = "suspend_late", + [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", + [SUSPEND_RESUME_NOIRQ] = "resume_noirq", + [SUSPEND_RESUME_EARLY] = "resume_early", + [SUSPEND_RESUME] = "resume", +}; + +#define suspend_attr(_name, format_str) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sysfs_emit(buf, format_str, suspend_stats._name);\ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_attr(success, "%u\n"); +suspend_attr(fail, "%u\n"); +suspend_attr(last_hw_sleep, "%llu\n"); +suspend_attr(total_hw_sleep, "%llu\n"); +suspend_attr(max_hw_sleep, "%llu\n"); + +#define suspend_step_attr(_name, step) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sysfs_emit(buf, "%u\n", \ + suspend_stats.step_failures[step-1]); \ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_step_attr(failed_freeze, SUSPEND_FREEZE); +suspend_step_attr(failed_prepare, SUSPEND_PREPARE); +suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); +suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); +suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); +suspend_step_attr(failed_resume, SUSPEND_RESUME); +suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); +suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); + +static ssize_t last_failed_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + char *last_failed_dev = NULL; + + index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_dev = suspend_stats.failed_devs[index]; + + return sysfs_emit(buf, "%s\n", last_failed_dev); +} +static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); + +static ssize_t last_failed_errno_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int index; + int last_failed_errno; + + index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + last_failed_errno = suspend_stats.errno[index]; + + return sysfs_emit(buf, "%d\n", last_failed_errno); +} +static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); + +static ssize_t last_failed_step_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + enum suspend_stat_step step; + int index; + + index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + index %= REC_FAILED_NUM; + step = suspend_stats.failed_steps[index]; + + return sysfs_emit(buf, "%s\n", suspend_step_names[step]); +} +static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); + +static struct attribute *suspend_attrs[] = { + &success.attr, + &fail.attr, + &failed_freeze.attr, + &failed_prepare.attr, + &failed_suspend.attr, + &failed_suspend_late.attr, + &failed_suspend_noirq.attr, + &failed_resume.attr, + &failed_resume_early.attr, + &failed_resume_noirq.attr, + &last_failed_dev.attr, + &last_failed_errno.attr, + &last_failed_step.attr, + &last_hw_sleep.attr, + &total_hw_sleep.attr, + &max_hw_sleep.attr, + NULL, +}; + +static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + if (attr != &last_hw_sleep.attr && + attr != &total_hw_sleep.attr && + attr != &max_hw_sleep.attr) + return 0444; + +#ifdef CONFIG_ACPI + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) + return 0444; +#endif + return 0; } +static const struct attribute_group suspend_attr_group = { + .name = "suspend_stats", + .attrs = suspend_attrs, + .is_visible = suspend_attr_is_visible, +}; + +#ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; + enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; @@ -165,74 +590,55 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", - "success", suspend_stats.success, - "fail", suspend_stats.fail, - "failed_freeze", suspend_stats.failed_freeze, - "failed_prepare", suspend_stats.failed_prepare, - "failed_suspend", suspend_stats.failed_suspend, - "failed_suspend_late", - suspend_stats.failed_suspend_late, - "failed_suspend_noirq", - suspend_stats.failed_suspend_noirq, - "failed_resume", suspend_stats.failed_resume, - "failed_resume_early", - suspend_stats.failed_resume_early, - "failed_resume_noirq", - suspend_stats.failed_resume_noirq); + + seq_printf(s, "success: %u\nfail: %u\n", + suspend_stats.success, suspend_stats.fail); + + for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) + seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], + suspend_stats.step_failures[step-1]); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", - suspend_stats.failed_devs[last_dev]); + suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-s\n", - suspend_stats.failed_devs[index]); + seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-d\n", - suspend_stats.errno[index]); + seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[last_step])); + suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[index])); + suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; } - -static int suspend_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, suspend_stats_show, NULL); -} - -static const struct file_operations suspend_stats_operations = { - .open = suspend_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, - NULL, NULL, &suspend_stats_operations); + NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -247,7 +653,7 @@ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_print_times_enabled); + return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, @@ -270,51 +676,103 @@ power_attr(pm_print_times); static inline void pm_print_times_init(void) { - pm_print_times_enabled = !!initcall_debug; + pm_print_times_enabled = initcall_debug; +} + +static ssize_t pm_wakeup_irq_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + if (!pm_wakeup_irq()) + return -ENODATA; + + return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } -#else /* !CONFIG_PP_SLEEP_DEBUG */ + +power_attr_ro(pm_wakeup_irq); + +bool pm_debug_messages_on __read_mostly; + +bool pm_debug_messages_should_print(void) +{ + return pm_debug_messages_on && pm_sleep_transition_in_progress(); +} +EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); + +static ssize_t pm_debug_messages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", pm_debug_messages_on); +} + +static ssize_t pm_debug_messages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_debug_messages_on = !!val; + return n; +} + +power_attr(pm_debug_messages); + +static int __init pm_debug_messages_setup(char *str) +{ + pm_debug_messages_on = true; + return 1; +} +__setup("pm_debug_messages", pm_debug_messages_setup); + +#else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; -/** - * state - control system power state. +/* + * state - control system sleep states. * - * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). + * show() returns available sleep state labels, which may be "mem", "standby", + * "freeze" and "disk" (hibernation). + * See Documentation/admin-guide/pm/sleep-states.rst for a description of + * what they mean. * - * store() accepts one of those strings, translates it into the - * proper enumerated value, and initiates a suspend transition. + * store() accepts one of those strings, translates it into the proper + * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; #ifdef CONFIG_SUSPEND - int i; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (pm_states[i]) + count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && valid_state(i)) - s += sprintf(s,"%s ", pm_states[i]); - } -#endif -#ifdef CONFIG_HIBERNATION - s += sprintf(s, "%s\n", "disk"); -#else - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; #endif - return (s - buf); + if (hibernation_available()) + count += sysfs_emit_at(buf, count, "disk "); + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_MIN; - const char * const *s; + suspend_state_t state; #endif char *p; int len; @@ -323,13 +781,16 @@ static suspend_state_t decode_state(const char *buf, size_t n) len = p ? p - buf : n; /* Check hibernation first. */ - if (len == 4 && !strncmp(buf, "disk", len)) + if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = pm_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; + } #endif return PM_SUSPEND_ON; @@ -351,12 +812,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -401,7 +866,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj, unsigned int val; return pm_get_wakeup_count(&val, true) ? - sprintf(buf, "%u\n", val) : -EINTR; + sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -443,17 +908,17 @@ static ssize_t autosleep_show(struct kobject *kobj, suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) - return sprintf(buf, "off\n"); + return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", valid_state(state) ? - pm_states[state] : "error"); + return sysfs_emit(buf, "%s\n", pm_states[state] ? + pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION - return sprintf(buf, "disk\n"); + return sysfs_emit(buf, "disk\n"); #else - return sprintf(buf, "error"); + return sysfs_emit(buf, "error\n"); #endif } @@ -468,6 +933,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -519,7 +987,7 @@ int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_trace_enabled); + return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t @@ -548,14 +1016,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ @@ -563,7 +1024,7 @@ power_attr(pm_trace_dev_match); static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", freeze_timeout_msecs); + return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, @@ -583,6 +1044,34 @@ power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +bool filesystem_freeze_enabled = false; + +static ssize_t freeze_filesystems_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); +} + +static ssize_t freeze_filesystems_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + filesystem_freeze_enabled = !!val; + return n; +} + +power_attr(freeze_filesystems); +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -592,6 +1081,10 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, + &sync_on_suspend_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif @@ -599,48 +1092,66 @@ static struct attribute * g[] = { &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif -#ifdef CONFIG_PM_DEBUG - &pm_test_attr.attr, -#endif #ifdef CONFIG_PM_SLEEP_DEBUG + &pm_test_attr.attr, &pm_print_times_attr.attr, + &pm_wakeup_irq_attr.attr, + &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + &freeze_filesystems_attr.attr, +#endif NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -#ifdef CONFIG_PM_RUNTIME +static const struct attribute_group *attr_groups[] = { + &attr_group, +#ifdef CONFIG_PM_SLEEP + &suspend_attr_group, +#endif + NULL, +}; + struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; - return pm_wq ? 0 : -ENOMEM; -} -#else -static inline int pm_start_workqueue(void) { return 0; } +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } #endif + return 0; +} + static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); + pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - error = sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index 7d4b7ffb3c1d..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -1,7 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/suspend.h> #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/compiler.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/crypto.h> struct swsusp_info { struct new_utsname uts; @@ -11,7 +16,12 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); + +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); +extern bool filesystem_freeze_enabled; +#endif #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ @@ -22,15 +32,12 @@ extern void __init hibernate_image_size_init(void); /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) -extern int arch_hibernation_header_save(void *addr, unsigned int max_size); -extern int arch_hibernation_header_restore(void *addr); - static inline int init_header_complete(struct swsusp_info *info) { return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } -static inline char *check_image_kernel(struct swsusp_info *info) +static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; @@ -49,20 +56,35 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +asmlinkage int swsusp_save(void); + /* kernel/power/hibernate.c */ extern bool freezer_test_done; +extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; + +/* kernel/power/swap.c */ +extern unsigned int swsusp_header_flags; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); +#ifdef CONFIG_STRICT_KERNEL_RWX +/* kernel/power/snapshot.c */ +extern void enable_restore_image_protection(void); +#else +static inline void enable_restore_image_protection(void) {} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} -#endif /* !CONFIG_HIBERNATION */ -extern int pfn_is_nosave(unsigned long); +static inline bool hibernation_in_progress(void) { return false; } +#endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ @@ -74,6 +96,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ @@ -82,14 +113,13 @@ extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -/** +extern void clear_or_poison_free_pages(void); + +/* * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries * (PBEs) which is the main data structure of swsusp. @@ -132,11 +162,11 @@ extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); extern int snapshot_read_next(struct snapshot_handle *handle); extern int snapshot_write_next(struct snapshot_handle *handle); -extern void snapshot_write_finalize(struct snapshot_handle *handle); +int snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); -/* If unset, the snapshot device cannot be open. */ -extern atomic_t snapshot_device_available; +extern bool hibernate_acquire(void); +extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); @@ -146,46 +176,55 @@ extern int swsusp_swap_in_use(void); * Flags that can be passed from the hibernatig hernel to the "boot" kernel in * the image header. */ +#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 +#define SF_HW_SIG 8 + +/* + * Bit to indicate the compression algorithm to be used(for LZ4). The same + * could be checked while saving/loading image to/from disk to use the + * corresponding algorithms. + * + * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use + * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4. + * + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4 + */ +#define SF_COMPRESSION_ALG_LZ4 16 /* kernel/power/hibernate.c */ -extern int swsusp_check(void); +int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(fmode_t); +void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); +#else +static inline int swsusp_unmark(void) { return 0; } #endif -/* kernel/power/block_io.c */ -extern struct block_device *hib_resume_bdev; - -extern int hib_bio_read_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_bio_write_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_wait_on_bio_chain(struct bio **bio_chain); - -struct timeval; +struct __kernel_old_timeval; /* kernel/power/swsusp.c */ -extern void swsusp_show_speed(struct timeval *, struct timeval *, - unsigned int, char *); +extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *const pm_states[]; +extern const char * const pm_labels[]; +extern const char *pm_states[]; +extern const char *mem_sleep_states[]; -extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM_TEST_SUSPEND @@ -199,6 +238,7 @@ static inline void suspend_test_finish(const char *label) {} #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ +extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); #endif @@ -227,7 +267,11 @@ enum { #define TEST_FIRST TEST_NONE #define TEST_MAX (__TEST_AFTER_LAST - 1) +#ifdef CONFIG_PM_SLEEP_DEBUG extern int pm_test_level; +#else +#define pm_test_level (TEST_NONE) +#endif #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) @@ -294,3 +338,17 @@ extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ + +static inline int pm_sleep_disable_secondary_cpus(void) +{ + cpuidle_pause(); + return suspend_disable_secondary_cpus(); +} + +static inline void pm_sleep_enable_secondary_cpus(void) +{ + suspend_enable_secondary_cpus(); + cpuidle_resume(); +} + +void dpm_save_errno(int err); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..1f306f158696 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 */ #include <linux/kernel.h> @@ -24,13 +23,13 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key) +static void handle_poweroff(u8 key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); } -static struct sysrq_key_op sysrq_poweroff_op = { +static const struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, .help_msg = "poweroff(o)", .action_msg = "Power Off", diff --git a/kernel/power/process.c b/kernel/power/process.c index fc0df8486449..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -1,41 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * drivers/power/process.c - Functions for starting/stopping processes on + * drivers/power/process.c - Functions for starting/stopping processes on * suspend transitions. * * Originally from swsusp. */ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/sched/debug.h> +#include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> #include <linux/workqueue.h> #include <linux/kmod.h> +#include <trace/events/power.h> +#include <linux/cpuset.h> -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { + const char *what = user_only ? "user space processes" : + "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + pr_info("Freezing %s\n", what); + + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -45,13 +50,12 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; - } while_each_thread(g, p); + todo++; + } read_unlock(&tasklist_lock); if (!user_only) { @@ -77,31 +81,31 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { - printk("\n"); - printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", + pr_err("Freezing %s %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); - if (!wakeup) { + if (wq_busy) + show_freezable_workqueues(); + + if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + for_each_process_thread(g, p) { + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { - printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, - elapsed_msecs % 1000); + pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", + what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; @@ -109,6 +113,8 @@ static int try_to_freeze_tasks(bool user_only) /** * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen. The same process that calls + * freeze_processes must later call thaw_processes. * * On success, returns 0. On failure, -errno and system is fully thawed. */ @@ -120,20 +126,29 @@ int freeze_processes(void) if (error) return error; + /* Make sure this task doesn't get frozen */ + current->flags |= PF_SUSPEND_TASK; + if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); - printk("Freezing user space processes ... "); + pm_wakeup_clear(0); pm_freezing = true; error = try_to_freeze_tasks(true); - if (!error) { - printk("done."); + if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - } - printk("\n"); + BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disable + * the OOM killer to disallow any further interference with + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. + */ + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) + error = -EBUSY; + if (error) thaw_processes(); return error; @@ -151,13 +166,9 @@ int freeze_kernel_threads(void) { int error; - printk("Freezing remaining freezable tasks ... "); pm_nosig_freezing = true; error = try_to_freeze_tasks(false); - if (!error) - printk("done."); - printk("\n"); BUG_ON(in_atomic()); if (error) @@ -168,28 +179,37 @@ int freeze_kernel_threads(void) void thaw_processes(void) { struct task_struct *g, *p; + struct task_struct *curr = current; + trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; oom_killer_enable(); - printk("Restarting tasks ... "); + pr_info("Restarting tasks: Starting\n"); + __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { + /* No other threads should have PF_SUSPEND_TASK set */ + WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); + WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); + curr->flags &= ~PF_SUSPEND_TASK; + usermodehelper_enable(); schedule(); - printk("done.\n"); + pr_info("Restarting tasks: Done\n"); + trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) @@ -197,17 +217,17 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - printk("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + for_each_process_thread(g, p) { + if (p->flags & PF_KTHREAD) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); - printk("done.\n"); + pr_info("Done restarting kernel threads.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 06fe28589e9c..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,30 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This module exposes the interface to kernel space for specifying - * QoS dependencies. It provides infrastructure for registration of: + * Power Management Quality of Service (PM QoS) support base. * - * Dependents on a QoS value : register requests - * Watchers of QoS value : get notified when target QoS value changes + * Copyright (C) 2020 Intel Corporation * - * This QoS design is best effort based. Dependents register their QoS needs. - * Watchers register to keep track of the current QoS needs of the system. + * Authors: + * Mark Gross <mgross@linux.intel.com> + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> * - * There are 3 basic classes of QoS parameter: latency, timeout, throughput - * each have defined units: - * latency: usec - * timeout: usec <-- currently not used. - * throughput: kbs (kilo byte / sec) + * Provided here is an interface for specifying PM QoS dependencies. It allows + * entities depending on QoS constraints to register their requests which are + * aggregated as appropriate to produce effective constraints (target values) + * that can be monitored by entities needing to respect them, either by polling + * or through a built-in notification mechanism. * - * There are lists of pm_qos_objects each one wrapping requests, notifiers - * - * User mode requests on a QOS parameter register themselves to the - * subsystem by opening the device node /dev/... and writing there request to - * the node. As long as the process holds a file handle open to the node the - * client continues to be accounted for. Upon file release the usermode - * request is removed and a new qos target is computed. This way when the - * request that the application has is cleaned up when closes the file - * pointer or exits the pm_qos_object will get an opportunity to clean up. - * - * Mark Gross <mgross@linux.intel.com> + * In addition to the basic functionality, more specific interfaces for managing + * global CPU latency QoS requests and frequency QoS requests are provided. */ /*#define DEBUG*/ @@ -41,6 +32,8 @@ #include <linux/platform_device.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/export.h> @@ -51,84 +44,21 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_object { - struct pm_qos_constraints *constraints; - struct miscdevice pm_qos_power_miscdev; - char *name; -}; - static DEFINE_SPINLOCK(pm_qos_lock); -static struct pm_qos_object null_pm_qos; - -static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_constraints cpu_dma_constraints = { - .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), - .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &cpu_dma_lat_notifier, -}; -static struct pm_qos_object cpu_dma_pm_qos = { - .constraints = &cpu_dma_constraints, - .name = "cpu_dma_latency", -}; - -static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_constraints network_lat_constraints = { - .list = PLIST_HEAD_INIT(network_lat_constraints.list), - .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN, - .notifiers = &network_lat_notifier, -}; -static struct pm_qos_object network_lat_pm_qos = { - .constraints = &network_lat_constraints, - .name = "network_latency", -}; - - -static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_constraints network_tput_constraints = { - .list = PLIST_HEAD_INIT(network_tput_constraints.list), - .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, - .type = PM_QOS_MAX, - .notifiers = &network_throughput_notifier, -}; -static struct pm_qos_object network_throughput_pm_qos = { - .constraints = &network_tput_constraints, - .name = "network_throughput", -}; - - -static struct pm_qos_object *pm_qos_array[] = { - &null_pm_qos, - &cpu_dma_pm_qos, - &network_lat_pm_qos, - &network_throughput_pm_qos -}; - -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos); -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos); -static int pm_qos_power_open(struct inode *inode, struct file *filp); -static int pm_qos_power_release(struct inode *inode, struct file *filp); - -static const struct file_operations pm_qos_power_fops = { - .write = pm_qos_power_write, - .read = pm_qos_power_read, - .open = pm_qos_power_open, - .release = pm_qos_power_release, - .llseek = noop_llseek, -}; +/** + * pm_qos_read_value - Return the current effective constraint value. + * @c: List of PM QoS constraint requests. + */ +s32 pm_qos_read_value(struct pm_qos_constraints *c) +{ + return READ_ONCE(c->target_value); +} -/* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_constraints *c) +static int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) - return c->default_value; + return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: @@ -138,40 +68,41 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) return plist_last(&c->list)->prio; default: - /* runtime check for not using enum */ - BUG(); + WARN(1, "Unknown PM QoS type in %s\n", __func__); return PM_QOS_DEFAULT_VALUE; } } -s32 pm_qos_read_value(struct pm_qos_constraints *c) +static void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - return c->target_value; -} - -static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) -{ - c->target_value = value; + WRITE_ONCE(c->target_value, value); } /** - * pm_qos_update_target - manages the constraints list and calls the notifiers - * if needed - * @c: constraints data struct - * @node: request to add to the list, to update or to remove - * @action: action to take on the constraints list - * @value: value of the request to add or update + * pm_qos_update_target - Update a list of PM QoS constraint requests. + * @c: List of PM QoS requests. + * @node: Target list entry. + * @action: Action to carry out (add, update or remove). + * @value: New request value for the target list entry. + * + * Update the given list of PM QoS constraint requests, @c, by carrying an + * @action involving the @node list entry and @value on it. * - * This function returns 1 if the aggregated constraint value has changed, 0 - * otherwise. + * The recognized values of @action are PM_QOS_ADD_REQ (store @value in @node + * and add it to the list), PM_QOS_UPDATE_REQ (remove @node from the list, store + * @value in it and add it to the list again), and PM_QOS_REMOVE_REQ (remove + * @node from the list, ignore @value). + * + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value) { - unsigned long flags; int prev_value, curr_value, new_value; + unsigned long flags; spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(c); if (value == PM_QOS_DEFAULT_VALUE) new_value = c->default_value; @@ -184,11 +115,11 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, break; case PM_QOS_UPDATE_REQ: /* - * to change the list, we atomically remove, reinit - * with new value and add, then see if the extremal - * changed + * To change the list, atomically remove, reinit with new value + * and add, then see if the aggregate has changed. */ plist_del(node, &c->list); + fallthrough; case PM_QOS_ADD_REQ: plist_node_init(node, new_value); plist_add(node, &c->list); @@ -204,14 +135,14 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, spin_unlock_irqrestore(&pm_qos_lock, flags); trace_pm_qos_update_target(action, prev_value, curr_value); - if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; - } else { + + if (prev_value == curr_value) return 0; - } + + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, curr_value, NULL); + + return 1; } /** @@ -233,14 +164,12 @@ static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf, /** * pm_qos_update_flags - Update a set of PM QoS flags. - * @pqf: Set of flags to update. + * @pqf: Set of PM QoS flags to update. * @req: Request to add to the set, to modify, or to remove from the set. * @action: Action to take on the set. * @val: Value of the request to add or modify. * - * Update the given set of PM QoS flags and call notifiers if the aggregate - * value has changed. Returns 1 if the aggregate constraint value has changed, - * 0 otherwise. + * Return: 1 if the aggregate constraint value has changed, 0 otherwise. */ bool pm_qos_update_flags(struct pm_qos_flags *pqf, struct pm_qos_flags_request *req, @@ -259,6 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, break; case PM_QOS_UPDATE_REQ: pm_qos_flags_remove_req(pqf, req); + fallthrough; case PM_QOS_ADD_REQ: req->flags = val; INIT_LIST_HEAD(&req->node); @@ -275,333 +205,585 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, spin_unlock_irqrestore(&pm_qos_lock, irqflags); trace_pm_qos_update_flags(action, prev_value, curr_value); + return prev_value != curr_value; } -/** - * pm_qos_request - returns current system wide qos expectation - * @pm_qos_class: identification of which qos value is requested - * - * This function returns the current target value. - */ -int pm_qos_request(int pm_qos_class) -{ - return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); -} -EXPORT_SYMBOL_GPL(pm_qos_request); +#ifdef CONFIG_CPU_IDLE +/* Definitions related to the CPU latency QoS. */ + +static struct pm_qos_constraints cpu_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_latency_constraints.list), + .target_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_LATENCY_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; -int pm_qos_request_active(struct pm_qos_request *req) +static inline bool cpu_latency_qos_value_invalid(s32 value) { - return req->pm_qos_class != 0; + return value < 0 && value != PM_QOS_DEFAULT_VALUE; } -EXPORT_SYMBOL_GPL(pm_qos_request_active); /** - * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout - * @work: work struct for the delayed work (timeout) - * - * This cancels the timeout request by falling back to the default at timeout. + * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ -static void pm_qos_work_fn(struct work_struct *work) +s32 cpu_latency_qos_limit(void) { - struct pm_qos_request *req = container_of(to_delayed_work(work), - struct pm_qos_request, - work); - - pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); + return pm_qos_read_value(&cpu_latency_constraints); } /** - * pm_qos_add_request - inserts new qos request into the list - * @req: pointer to a preallocated handle - * @pm_qos_class: identifies which list of qos request to use - * @value: defines the qos request + * cpu_latency_qos_request_active - Check the given PM QoS request. + * @req: PM QoS request to check. * - * This function inserts a new entry in the pm_qos_class list of requested qos - * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request - * handle. Caller needs to save this handle for later use in updates and - * removal. + * Return: 'true' if @req has been added to the CPU latency QoS list, 'false' + * otherwise. */ - -void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) +bool cpu_latency_qos_request_active(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ - return; + return req->qos == &cpu_latency_constraints; +} +EXPORT_SYMBOL_GPL(cpu_latency_qos_request_active); - if (pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); - return; - } - req->pm_qos_class = pm_qos_class; - INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); - trace_pm_qos_add_request(pm_qos_class, value); - pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, - &req->node, PM_QOS_ADD_REQ, value); +static void cpu_latency_qos_apply(struct pm_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret = pm_qos_update_target(req->qos, &req->node, action, value); + if (ret > 0) + wake_up_all_idle_cpus(); } -EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_request - modifies an existing qos request - * @req : handle to list element holding a pm_qos request to use - * @value: defines the qos request + * cpu_latency_qos_add_request - Add new CPU latency QoS request. + * @req: Pointer to a preallocated handle. + * @value: Requested constraint value. * - * Updates an existing qos request for the pm_qos_class of parameters along - * with updating the target pm_qos_class value. + * Use @value to initialize the request handle pointed to by @req, insert it as + * a new entry to the CPU latency QoS list and recompute the effective QoS + * constraint for that list. * - * Attempts are made to make this code callable on hot code paths. + * Callers need to save the handle for later use in updates and removal of the + * QoS request represented by it. */ -void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) +void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) /*guard against callers passing in null */ + if (!req || cpu_latency_qos_value_invalid(value)) return; - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + if (cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for already added request\n", __func__); return; } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_add_request(value); - trace_pm_qos_update_request(req->pm_qos_class, new_value); - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + req->qos = &cpu_latency_constraints; + cpu_latency_qos_apply(req, PM_QOS_ADD_REQ, value); } -EXPORT_SYMBOL_GPL(pm_qos_update_request); +EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); /** - * pm_qos_update_request_timeout - modifies an existing qos request temporarily. - * @req : handle to list element holding a pm_qos request to use - * @new_value: defines the temporal qos request - * @timeout_us: the effective duration of this qos request in usecs. + * cpu_latency_qos_update_request - Modify existing CPU latency QoS request. + * @req : QoS request to update. + * @new_value: New requested constraint value. * - * After timeout_us, this qos request is cancelled automatically. + * Use @new_value to update the QoS request represented by @req in the CPU + * latency QoS list along with updating the effective constraint value for that + * list. */ -void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, - unsigned long timeout_us) +void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(new_value)) return; - if (WARN(!pm_qos_request_active(req), - "%s called for unknown object.", __func__)) + + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; + } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_update_request(new_value); - trace_pm_qos_update_request_timeout(req->pm_qos_class, - new_value, timeout_us); - if (new_value != req->node.prio) - pm_qos_update_target( - pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_UPDATE_REQ, new_value); + if (new_value == req->node.prio) + return; - schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); + cpu_latency_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); } +EXPORT_SYMBOL_GPL(cpu_latency_qos_update_request); /** - * pm_qos_remove_request - modifies an existing qos request - * @req: handle to request list element + * cpu_latency_qos_remove_request - Remove existing CPU latency QoS request. + * @req: QoS request to remove. * - * Will remove pm qos request from the list of constraints and - * recompute the current target value for the pm_qos_class. Call this - * on slow code paths. + * Remove the CPU latency QoS request represented by @req from the CPU latency + * QoS list along with updating the effective constraint value for that list. */ -void pm_qos_remove_request(struct pm_qos_request *req) +void cpu_latency_qos_remove_request(struct pm_qos_request *req) { - if (!req) /*guard against callers passing in null */ + if (!req) return; - /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(req)) { - WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + if (!cpu_latency_qos_request_active(req)) { + WARN(1, KERN_ERR "%s called for unknown object\n", __func__); return; } - cancel_delayed_work_sync(&req->work); + trace_pm_qos_remove_request(PM_QOS_DEFAULT_VALUE); - trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); - pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, - &req->node, PM_QOS_REMOVE_REQ, - PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } -EXPORT_SYMBOL_GPL(pm_qos_remove_request); +EXPORT_SYMBOL_GPL(cpu_latency_qos_remove_request); -/** - * pm_qos_add_notifier - sets notification entry for changes to target value - * @pm_qos_class: identifies which qos target changes should be notified. - * @notifier: notifier block managed by caller. - * - * will register the notifier into a notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +/* User space interface to the CPU latency QoS via misc device. */ + +static int cpu_latency_qos_open(struct inode *inode, struct file *filp) { - int retval; + struct pm_qos_request *req; - retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + cpu_latency_qos_add_request(req, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; - return retval; + return 0; } -EXPORT_SYMBOL_GPL(pm_qos_add_notifier); -/** - * pm_qos_remove_notifier - deletes notification entry from chain. - * @pm_qos_class: identifies which qos target changes are notified. - * @notifier: notifier block to be removed. - * - * will remove the notifier from the notification chain that gets called - * upon changes to the pm_qos_class target value. - */ -int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) +static int cpu_latency_qos_release(struct inode *inode, struct file *filp) { - int retval; + struct pm_qos_request *req = filp->private_data; - retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->constraints->notifiers, - notifier); + filp->private_data = NULL; - return retval; + cpu_latency_qos_remove_request(req); + kfree(req); + + return 0; } -EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -/* User space interface to PM QoS classes via misc devices */ -static int register_pm_qos_misc(struct pm_qos_object *qos) +static ssize_t cpu_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) { - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + struct pm_qos_request *req = filp->private_data; + unsigned long flags; + s32 value; + + if (!req || !cpu_latency_qos_request_active(req)) + return -EINVAL; - return misc_register(&qos->pm_qos_power_miscdev); + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(&cpu_latency_constraints); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } -static int find_pm_qos_object_by_minor(int minor) +static ssize_t cpu_latency_qos_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) { - int pm_qos_class; + s32 value; - for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; } - return -1; + + cpu_latency_qos_update_request(filp->private_data, value); + + return count; } -static int pm_qos_power_open(struct inode *inode, struct file *filp) -{ - long pm_qos_class; +static const struct file_operations cpu_latency_qos_fops = { + .write = cpu_latency_qos_write, + .read = cpu_latency_qos_read, + .open = cpu_latency_qos_open, + .release = cpu_latency_qos_release, + .llseek = noop_llseek, +}; - pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); - if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { - struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - return -ENOMEM; +static struct miscdevice cpu_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_dma_latency", + .fops = &cpu_latency_qos_fops, +}; - pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); - filp->private_data = req; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; - return 0; - } - return -EPERM; +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); } -static int pm_qos_power_release(struct inode *inode, struct file *filp) +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) { struct pm_qos_request *req; - req = filp->private_data; - pm_qos_remove_request(req); - kfree(req); + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; return 0; } - -static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, - size_t count, loff_t *f_pos) +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) { - s32 value; - unsigned long flags; struct pm_qos_request *req = filp->private_data; - if (!req) - return -EINVAL; - if (!pm_qos_request_active(req)) - return -EINVAL; + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); - spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); - spin_unlock_irqrestore(&pm_qos_lock, flags); + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); } -static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, - size_t count, loff_t *f_pos) +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) { + struct pm_qos_request *req = filp->private_data; s32 value; - struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count <= 11) { /* ASCII perhaps? */ - char ascii_value[11]; - unsigned long int ulval; + } else { int ret; - if (copy_from_user(ascii_value, buf, count)) - return -EFAULT; + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } - if (count > 10) { - if (ascii_value[10] == '\n') - ascii_value[10] = '\0'; - else - return -EINVAL; - } else { - ascii_value[count] = '\0'; - } - ret = kstrtoul(ascii_value, 16, &ulval); - if (ret) { - pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); - return -EINVAL; - } - value = (s32)lower_32_bits(ulval); - } else { + if (value < 0) return -EINVAL; - } - req = filp->private_data; - pm_qos_update_request(req, value); + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); return count; } +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; -static int __init pm_qos_power_init(void) +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + +static int __init cpu_latency_qos_init(void) { - int ret = 0; - int i; + int ret; - BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + ret = misc_register(&cpu_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_latency_qos_miscdev.name); - for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { - ret = register_pm_qos_misc(pm_qos_array[i]); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: %s setup failed\n", - pm_qos_array[i]->name); - return ret; - } +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + + return ret; +} +late_initcall(cpu_latency_qos_init); +#endif /* CONFIG_CPU_IDLE */ + +/* Definitions related to the frequency QoS below. */ + +static inline bool freq_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize. + */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. + * + * This is only meant to be called from inside pm_qos, not drivers. + */ +int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; } return ret; } +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req || freq_qos_value_invalid(new_value)) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + int ret; + + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + req->qos = NULL; + req->type = 0; -late_initcall(pm_qos_power_init); + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. + */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. + */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/snapshot.c * @@ -5,11 +6,10 @@ * * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. - * */ +#define pr_fmt(fmt) "PM: hibernation: " fmt + #include <linux/version.h> #include <linux/module.h> #include <linux/mm.h> @@ -21,21 +21,97 @@ #include <linux/pm.h> #include <linux/device.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/nmi.h> #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/compiler.h> +#include <linux/ktime.h> +#include <linux/set_memory.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/io.h> #include "power.h" +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) +static bool hibernate_restore_protection; +static bool hibernate_restore_protection_active; + +void enable_restore_image_protection(void) +{ + hibernate_restore_protection = true; +} + +static inline void hibernate_restore_protection_begin(void) +{ + hibernate_restore_protection_active = hibernate_restore_protection; +} + +static inline void hibernate_restore_protection_end(void) +{ + hibernate_restore_protection_active = false; +} + +static inline int __must_check hibernate_restore_protect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + return set_memory_ro((unsigned long)page_address, 1); + return 0; +} + +static inline int hibernate_restore_unprotect_page(void *page_address) +{ + if (hibernate_restore_protection_active) + return set_memory_rw((unsigned long)page_address, 1); + return 0; +} +#else +static inline void hibernate_restore_protection_begin(void) {} +static inline void hibernate_restore_protection_end(void) {} +static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; } +static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; } +#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. + */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -62,28 +138,35 @@ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = ((totalram_pages() * 2) / 5) * PAGE_SIZE; } -/* List of PBEs needed for restoring the pages that were allocated before +/* + * List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written * directly to their "original" page frames. */ struct pbe *restore_pblist; -/* Pointer to an auxiliary buffer (1 page) */ -static void *buffer; +/* struct linked_page is used to build chains of pages */ -/** - * @safe_needed - on resume, for storing the PBE list and the image, - * we can only use memory pages that do not conflict with the pages - * used before suspend. The unsafe pages have PageNosaveFree set - * and we count them using unsafe_pages. - * - * Each allocated image page is marked as PageNosave and PageNosaveFree - * so that swsusp_free() can release it. +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __packed; + +/* + * List of "safe" pages (ie. pages that were not used by the image kernel + * before hibernation) that may be used as temporary storage for image kernel + * memory contents. */ +static struct linked_page *safe_pages_list; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; #define PG_ANY 0 #define PG_SAFE 1 @@ -92,6 +175,19 @@ static void *buffer; static unsigned int allocated_unsafe_pages; +/** + * get_image_page - Allocate a page for a hibernation image. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages that were not used before hibernation (restore only) + * + * During image restoration, for storing the PBE list and the image data, we can + * only use memory pages that do not conflict with the pages used before + * hibernation. The "unsafe" pages have PageNosaveFree set and we count them + * using allocated_unsafe_pages. + * + * Each allocated image page is marked as PageNosave and PageNosaveFree so that + * swsusp_free() can release it. + */ static void *get_image_page(gfp_t gfp_mask, int safe_needed) { void *res; @@ -111,9 +207,21 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) return res; } +static void *__get_safe_page(gfp_t gfp_mask) +{ + if (safe_pages_list) { + void *ret = safe_pages_list; + + safe_pages_list = safe_pages_list->next; + memset(ret, 0, PAGE_SIZE); + return ret; + } + return get_image_page(gfp_mask, PG_SAFE); +} + unsigned long get_safe_page(gfp_t gfp_mask) { - return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + return (unsigned long)__get_safe_page(gfp_mask); } static struct page *alloc_image_page(gfp_t gfp_mask) @@ -128,11 +236,22 @@ static struct page *alloc_image_page(gfp_t gfp_mask) return page; } +static void recycle_safe_page(void *page_address) +{ + struct linked_page *lp = page_address; + + lp->next = safe_pages_list; + safe_pages_list = lp; +} + /** - * free_image_page - free page represented by @addr, allocated with - * get_image_page (page flags set by it must be cleared) + * free_image_page - Free a page allocated for hibernation image. + * @addr: Address of the page to free. + * @clear_nosave_free: If set, clear the PageNosaveFree bit for the page. + * + * The page to free should have been allocated by get_image_page() (page flags + * set by it are affected). */ - static inline void free_image_page(void *addr, int clear_nosave_free) { struct page *page; @@ -148,17 +267,8 @@ static inline void free_image_page(void *addr, int clear_nosave_free) __free_page(page); } -/* struct linked_page is used to build chains of pages */ - -#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) - -struct linked_page { - struct linked_page *next; - char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); - -static inline void -free_list_of_pages(struct linked_page *list, int clear_page_nosave) +static inline void free_list_of_pages(struct linked_page *list, + int clear_page_nosave) { while (list) { struct linked_page *lp = list->next; @@ -168,30 +278,28 @@ free_list_of_pages(struct linked_page *list, int clear_page_nosave) } } -/** - * struct chain_allocator is used for allocating small objects out of - * a linked list of pages called 'the chain'. - * - * The chain grows each time when there is no room for a new object in - * the current page. The allocated objects cannot be freed individually. - * It is only possible to free them all at once, by freeing the entire - * chain. - * - * NOTE: The chain allocator may be inefficient if the allocated objects - * are not much smaller than PAGE_SIZE. - */ - +/* + * struct chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ struct chain_allocator { struct linked_page *chain; /* the chain */ unsigned int used_space; /* total size of objects allocated out - * of the current page - */ + of the current page */ gfp_t gfp_mask; /* mask for allocating pages */ int safe_needed; /* if set, only "safe" pages are allocated */ }; -static void -chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +static void chain_init(struct chain_allocator *ca, gfp_t gfp_mask, + int safe_needed) { ca->chain = NULL; ca->used_space = LINKED_PAGE_DATA_SIZE; @@ -206,7 +314,8 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { struct linked_page *lp; - lp = get_image_page(ca->gfp_mask, ca->safe_needed); + lp = ca->safe_needed ? __get_safe_page(ca->gfp_mask) : + get_image_page(ca->gfp_mask, PG_ANY); if (!lp) return NULL; @@ -219,102 +328,276 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** - * Data types related to memory bitmaps. +/* + * Data types related to memory bitmaps. + * + * Memory bitmap is a structure consisting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresponds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each blocks of bitmap in which information is stored. * - * Memory bitmap is a structure consiting of many linked lists of - * objects. The main list's elements are of type struct zone_bitmap - * and each of them corresonds to one zone. For each zone bitmap - * object there is a list of objects of type struct bm_block that - * represent each blocks of bitmap in which information is stored. + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. * - * struct memory_bitmap contains a pointer to the main list of zone - * bitmap objects, a struct bm_position used for browsing the bitmap, - * and a pointer to the list of pages used for allocating all of the - * zone bitmap objects and bitmap block objects. + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). * - * NOTE: It has to be possible to lay out the bitmap in memory - * using only allocations of order 0. Additionally, the bitmap is - * designed to work with arbitrary number of zones (this is over the - * top for now, but let's avoid making unnecessary assumptions ;-). + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * PFNs that correspond to the start and end of the represented zone. * - * struct zone_bitmap contains a pointer to a list of bitmap block - * objects and a pointer to the bitmap block object that has been - * most recently used for setting bits. Additionally, it contains the - * pfns that correspond to the start and end of the represented zone. + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. * - * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bitmap) - * It also contains the pfns that correspond to the start and end of - * the represented memory area. + * The memory bitmap is organized as a radix tree to guarantee fast random + * access to the bits. There is one radix tree for each zone (as returned + * from create_mem_extents). + * + * One radix tree is represented by one struct mem_zone_bm_rtree. There are + * two linked lists for the nodes of the tree, one for the inner nodes and + * one for the leaf nodes. The linked leaf nodes are used for fast linear + * access of the memory bitmap. + * + * The struct rtree_node represents one node of the radix tree. */ #define BM_END_OF_MAP (~0UL) #define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3) +#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { - struct list_head hook; /* hook into a list of bitmap blocks */ - unsigned long start_pfn; /* pfn represented by the first bit */ - unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned long *data; /* bitmap representing pages */ +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { + struct list_head list; + unsigned long *data; }; -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ - return bb->end_pfn - bb->start_pfn; -} +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. + */ +struct mem_zone_bm_rtree { + struct list_head list; /* Link Zones together */ + struct list_head nodes; /* Radix Tree inner nodes */ + struct list_head leaves; /* Radix Tree leaves */ + unsigned long start_pfn; /* Zone start page frame */ + unsigned long end_pfn; /* Zone end page frame + 1 */ + struct rtree_node *rtree; /* Radix Tree Root */ + int levels; /* Number of Radix Tree Levels */ + unsigned int blocks; /* Number of Bitmap Blocks */ +}; -/* strcut bm_position is used for browsing memory bitmaps */ +/* struct bm_position is used for browsing memory bitmaps */ struct bm_position { - struct bm_block *block; - int bit; + struct mem_zone_bm_rtree *zone; + struct rtree_node *node; + unsigned long node_pfn; + unsigned long cur_pfn; + int node_bit; }; struct memory_bitmap { - struct list_head blocks; /* list of bitmap blocks */ + struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone - * bitmap objects and bitmap block - * objects - */ + bitmap objects and bitmap block + objects */ struct bm_position cur; /* most recently used bit position */ }; /* Functions that operate on memory bitmaps */ -static void memory_bm_position_reset(struct memory_bitmap *bm) +#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/** + * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. + * + * This function is used to allocate inner nodes as well as the + * leave nodes of the radix tree. It also adds the node to the + * corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, + struct chain_allocator *ca, + struct list_head *list) { - bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); - bm->cur.bit = 0; -} + struct rtree_node *node; -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + node = chain_alloc(ca, sizeof(struct rtree_node)); + if (!node) + return NULL; + + node->data = get_image_page(gfp_mask, safe_needed); + if (!node->data) + return NULL; + + list_add_tail(&node->list, list); + + return node; +} /** - * create_bm_block_list - create a list of block bitmap objects - * @pages - number of pages to track - * @list - list to put the allocated blocks into - * @ca - chain allocator to be used for allocating memory + * add_rtree_block - Add a new leave node to the radix tree. + * + * The leave nodes need to be allocated in order to keep the leaves + * linked list in order. This is guaranteed by the zone->blocks + * counter. */ -static int create_bm_block_list(unsigned long pages, - struct list_head *list, - struct chain_allocator *ca) +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, + int safe_needed, struct chain_allocator *ca) { - unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + struct rtree_node *node, *block, **dst; + unsigned int levels_needed, block_nr; + int i; - while (nr_blocks-- > 0) { - struct bm_block *bb; + block_nr = zone->blocks; + levels_needed = 0; - bb = chain_alloc(ca, sizeof(struct bm_block)); - if (!bb) + /* How many levels do we need for this block nr? */ + while (block_nr) { + levels_needed += 1; + block_nr >>= BM_RTREE_LEVEL_SHIFT; + } + + /* Make sure the rtree has enough levels */ + for (i = zone->levels; i < levels_needed; i++) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) return -ENOMEM; - list_add(&bb->hook, list); + + node->data[0] = (unsigned long)zone->rtree; + zone->rtree = node; + zone->levels += 1; } + /* Allocate new block */ + block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); + if (!block) + return -ENOMEM; + + /* Now walk the rtree to insert the block */ + node = zone->rtree; + dst = &zone->rtree; + block_nr = zone->blocks; + for (i = zone->levels; i > 0; i--) { + int index; + + if (!node) { + node = alloc_rtree_node(gfp_mask, safe_needed, ca, + &zone->nodes); + if (!node) + return -ENOMEM; + *dst = node; + } + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + dst = (struct rtree_node **)&((*dst)->data[index]); + node = *dst; + } + + zone->blocks += 1; + *dst = block; + return 0; } +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free); + +/** + * create_zone_bm_rtree - Create a radix tree for one zone. + * + * Allocated the mem_zone_bm_rtree structure and initializes it. + * This function also allocated and builds the radix tree for the + * zone. + */ +static struct mem_zone_bm_rtree *create_zone_bm_rtree(gfp_t gfp_mask, + int safe_needed, + struct chain_allocator *ca, + unsigned long start, + unsigned long end) +{ + struct mem_zone_bm_rtree *zone; + unsigned int i, nr_blocks; + unsigned long pages; + + pages = end - start; + zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); + if (!zone) + return NULL; + + INIT_LIST_HEAD(&zone->nodes); + INIT_LIST_HEAD(&zone->leaves); + zone->start_pfn = start; + zone->end_pfn = end; + nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + for (i = 0; i < nr_blocks; i++) { + if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { + free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); + return NULL; + } + } + + return zone; +} + +/** + * free_zone_bm_rtree - Free the memory of the radix tree. + * + * Free all node pages of the radix tree. The mem_zone_bm_rtree + * structure itself is not freed here nor are the rtree_node + * structs. + */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, + int clear_nosave_free) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + free_image_page(node->data, clear_nosave_free); + + list_for_each_entry(node, &zone->leaves, list) + free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.cur_pfn = BM_END_OF_MAP; + bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + struct mem_extent { struct list_head hook; unsigned long start; @@ -322,8 +605,8 @@ struct mem_extent { }; /** - * free_mem_extents - free a list of memory extents - * @list - list of extents to empty + * free_mem_extents - Free a list of memory extents. + * @list: List of extents to free. */ static void free_mem_extents(struct list_head *list) { @@ -336,10 +619,11 @@ static void free_mem_extents(struct list_head *list) } /** - * create_mem_extents - create a list of memory extents representing - * contiguous ranges of PFNs - * @list - list to put the extents into - * @gfp_mask - mask to use for memory allocations + * create_mem_extents - Create a list of memory extents. + * @list: List to put the extents into. + * @gfp_mask: Mask to use for memory allocations. + * + * The extents represent contiguous ranges of PFNs. */ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { @@ -352,7 +636,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -395,10 +679,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) } /** - * memory_bm_create - allocate memory for a memory bitmap - */ -static int -memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + * memory_bm_create - Allocate memory for a memory bitmap. + */ +static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, + int safe_needed) { struct chain_allocator ca; struct list_head mem_extents; @@ -406,40 +690,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) int error; chain_init(&ca, gfp_mask, safe_needed); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); error = create_mem_extents(&mem_extents, gfp_mask); if (error) return error; list_for_each_entry(ext, &mem_extents, hook) { - struct bm_block *bb; - unsigned long pfn = ext->start; - unsigned long pages = ext->end - ext->start; - - bb = list_entry(bm->blocks.prev, struct bm_block, hook); + struct mem_zone_bm_rtree *zone; - error = create_bm_block_list(pages, bm->blocks.prev, &ca); - if (error) + zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, + ext->start, ext->end); + if (!zone) { + error = -ENOMEM; goto Error; - - list_for_each_entry_continue(bb, &bm->blocks, hook) { - bb->data = get_image_page(gfp_mask, safe_needed); - if (!bb->data) { - error = -ENOMEM; - goto Error; - } - - bb->start_pfn = pfn; - if (pages >= BM_BITS_PER_BLOCK) { - pfn += BM_BITS_PER_BLOCK; - pages -= BM_BITS_PER_BLOCK; - } else { - /* This is executed only once in the loop */ - pfn += pages; - } - bb->end_pfn = pfn; } + list_add_tail(&zone->list, &bm->zones); } bm->p_list = ca.chain; @@ -455,55 +721,94 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) } /** - * memory_bm_free - free memory occupied by the memory bitmap @bm - */ + * memory_bm_free - Free memory occupied by the memory bitmap. + * @bm: Memory bitmap. + */ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct bm_block *bb; + struct mem_zone_bm_rtree *zone; - list_for_each_entry(bb, &bm->blocks, hook) - if (bb->data) - free_image_page(bb->data, clear_nosave_free); + list_for_each_entry(zone, &bm->zones, list) + free_zone_bm_rtree(zone, clear_nosave_free); free_list_of_pages(bm->p_list, clear_nosave_free); - INIT_LIST_HEAD(&bm->blocks); + INIT_LIST_HEAD(&bm->zones); } /** - * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - * to given pfn. The cur_zone_bm member of @bm and the cur_block member - * of @bm->cur_zone_bm are updated. + * memory_bm_find_bit - Find the bit for a given PFN in a memory bitmap. + * + * Find the bit in memory bitmap @bm that corresponds to the given PFN. + * The cur.zone, cur.block and cur.node_pfn members of @bm are updated. + * + * Walk the radix tree to find the page containing the bit that represents @pfn + * and return the position of the bit in @addr and @bit_nr. */ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) + void **addr, unsigned int *bit_nr) { - struct bm_block *bb; + struct mem_zone_bm_rtree *curr, *zone; + struct rtree_node *node; + int i, block_nr; + + zone = bm->cur.zone; + + if (pfn >= zone->start_pfn && pfn < zone->end_pfn) + goto zone_found; + + zone = NULL; + + /* Find the right zone */ + list_for_each_entry(curr, &bm->zones, list) { + if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { + zone = curr; + break; + } + } + + if (!zone) + return -EFAULT; +zone_found: /* - * Check if the pfn corresponds to the current bitmap block and find - * the block where it fits if this is not the case. + * We have found the zone. Now walk the radix tree to find the leaf node + * for our PFN. */ - bb = bm->cur.block; - if (pfn < bb->start_pfn) - list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn) - break; - if (pfn >= bb->end_pfn) - list_for_each_entry_continue(bb, &bm->blocks, hook) - if (pfn >= bb->start_pfn && pfn < bb->end_pfn) - break; + /* + * If the zone we wish to scan is the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ + node = bm->cur.node; + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + goto node_found; - if (&bb->hook == &bm->blocks) - return -EFAULT; + node = zone->rtree; + block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + + for (i = zone->levels; i > 0; i--) { + int index; + + index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); + index &= BM_RTREE_LEVEL_MASK; + BUG_ON(node->data[index] == 0); + node = (struct rtree_node *)node->data[index]; + } + +node_found: + /* Update last position */ + bm->cur.zone = zone; + bm->cur.node = node; + bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.cur_pfn = pfn; + + /* Set return values */ + *addr = node->data; + *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; - /* The block has been found */ - bm->cur.block = bb; - pfn -= bb->start_pfn; - bm->cur.bit = pfn + 1; - *bit_nr = pfn; - *addr = bb->data; return 0; } @@ -527,6 +832,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) error = memory_bm_find_bit(bm, pfn, &addr, &bit); if (!error) set_bit(bit, addr); + return error; } @@ -541,6 +847,19 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + +static unsigned long memory_bm_get_current(struct memory_bitmap *bm) +{ + return bm->cur.cur_pfn; +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -560,45 +879,79 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) return !memory_bm_find_bit(bm, pfn, &addr, &bit); } -/** - * memory_bm_next_pfn - find the pfn that corresponds to the next set bit - * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is - * returned. +/* + * rtree_next_node - Jump to the next leaf node. + * + * Set the position to the beginning of the next node in the + * memory bitmap. This is either the next node in the current + * zone's radix tree or the first node in the radix tree of the + * next zone. * - * It is required to run memory_bm_position_reset() before the first call to - * this function. + * Return true if there is a next node, false otherwise. */ +static bool rtree_next_node(struct memory_bitmap *bm) +{ + if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) { + bm->cur.node = list_entry(bm->cur.node->list.next, + struct rtree_node, list); + bm->cur.node_pfn += BM_BITS_PER_BLOCK; + bm->cur.node_bit = 0; + touch_softlockup_watchdog(); + return true; + } + + /* No more nodes, goto next zone */ + if (!list_is_last(&bm->cur.zone->list, &bm->zones)) { + bm->cur.zone = list_entry(bm->cur.zone->list.next, + struct mem_zone_bm_rtree, list); + bm->cur.node = list_entry(bm->cur.zone->leaves.next, + struct rtree_node, list); + bm->cur.node_pfn = 0; + bm->cur.node_bit = 0; + return true; + } + /* No more zones */ + return false; +} + +/** + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. + * @bm: Memory bitmap. + * + * Starting from the last returned position this function searches for the next + * set bit in @bm and returns the PFN represented by it. If no more bits are + * set, BM_END_OF_MAP is returned. + * + * It is required to run memory_bm_position_reset() before the first call to + * this function for the given memory bitmap. + */ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct bm_block *bb; + unsigned long bits, pfn, pages; int bit; - bb = bm->cur.block; do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = list_entry(bb->hook.next, struct bm_block, hook); - bm->cur.block = bb; - bm->cur.bit = 0; - } while (&bb->hook != &bm->blocks); + pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; + bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur.node->data, bits, + bm->cur.node_bit); + if (bit < bits) { + pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; + bm->cur.node_bit = bit + 1; + bm->cur.cur_pfn = pfn; + return pfn; + } + } while (rtree_next_node(bm)); - memory_bm_position_reset(bm); + bm->cur.cur_pfn = BM_END_OF_MAP; return BM_END_OF_MAP; - - Return_pfn: - bm->cur.bit = bit + 1; - return bb->start_pfn + bit; } -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. +/* + * This structure represents a range of page frames the contents of which + * should not be saved during hibernation. */ - struct nosave_region { struct list_head list; unsigned long start_pfn; @@ -607,15 +960,41 @@ struct nosave_region { static LIST_HEAD(nosave_regions); +static void recycle_zone_bm_rtree(struct mem_zone_bm_rtree *zone) +{ + struct rtree_node *node; + + list_for_each_entry(node, &zone->nodes, list) + recycle_safe_page(node->data); + + list_for_each_entry(node, &zone->leaves, list) + recycle_safe_page(node->data); +} + +static void memory_bm_recycle(struct memory_bitmap *bm) +{ + struct mem_zone_bm_rtree *zone; + struct linked_page *p_list; + + list_for_each_entry(zone, &bm->zones, list) + recycle_zone_bm_rtree(zone); + + p_list = bm->p_list; + while (p_list) { + struct linked_page *lp = p_list; + + p_list = lp->next; + recycle_safe_page(lp); + } +} + /** - * register_nosave_region - register a range of page frames the contents - * of which should not be saved during the suspend (to be used in the early - * initialization code) + * register_nosave_region - Register a region of unsaveable memory. + * + * Register a range of page frames the contents of which should not be saved + * during hibernation (to be used in the early initialization code). */ - -void __init -__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, - int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -631,18 +1010,14 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, goto Report; } } - if (use_kmalloc) { - /* during init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else - /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + /* This allocation cannot fail */ + region = memblock_alloc_or_panic(sizeof(struct nosave_region), + SMP_CACHE_BYTES); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } @@ -698,10 +1073,12 @@ static void swsusp_unset_page_forbidden(struct page *page) } /** - * mark_nosave_pages - set bits corresponding to the page frames the - * contents of which should not be saved in a given bitmap. + * mark_nosave_pages - Mark pages that should not be saved. + * @bm: Memory bitmap. + * + * Set the bits in @bm that correspond to the page frames the contents of which + * should not be saved. */ - static void mark_nosave_pages(struct memory_bitmap *bm) { struct nosave_region *region; @@ -712,38 +1089,40 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n", (unsigned long long) region->start_pfn << PAGE_SHIFT, ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } /** - * create_basic_memory_bitmaps - create bitmaps needed for marking page - * frames that should not be saved and free page frames. The pointers - * forbidden_pages_map and free_pages_map are only modified if everything - * goes well, because we don't want the bits to be used before both bitmaps - * are set up. + * create_basic_memory_bitmaps - Create bitmaps to hold basic page information. + * + * Create bitmaps needed for marking page frames that should not be saved and + * free page frames. The forbidden_pages_map and free_pages_map pointers are + * only modified if everything goes well, because we don't want the bits to be + * touched before both bitmaps are set up. */ - int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; - BUG_ON(forbidden_pages_map || free_pages_map); + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) @@ -765,31 +1144,32 @@ int create_basic_memory_bitmaps(void) free_pages_map = bm2; mark_nosave_pages(forbidden_pages_map); - pr_debug("PM: Basic memory bitmaps created\n"); + pr_debug("Basic memory bitmaps created\n"); return 0; Free_second_object: kfree(bm2); Free_first_bitmap: - memory_bm_free(bm1, PG_UNSAFE_CLEAR); + memory_bm_free(bm1, PG_UNSAFE_CLEAR); Free_first_object: kfree(bm1); return -ENOMEM; } /** - * free_basic_memory_bitmaps - free memory bitmaps allocated by - * create_basic_memory_bitmaps(). The auxiliary pointers are necessary - * so that the bitmaps themselves are not referred to while they are being - * freed. + * free_basic_memory_bitmaps - Free memory bitmaps holding basic information. + * + * Free memory bitmaps allocated by create_basic_memory_bitmaps(). The + * auxiliary pointers are necessary so that the bitmaps themselves are not + * referred to while they are being freed. */ - void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; @@ -800,31 +1180,119 @@ void free_basic_memory_bitmaps(void) memory_bm_free(bm2, PG_UNSAFE_CLEAR); kfree(bm2); - pr_debug("PM: Basic memory bitmaps freed\n"); + pr_debug("Basic memory bitmaps freed\n"); +} + +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) +{ + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + + if (WARN_ON(!(free_pages_map))) + return; + + if (page_poisoning_enabled() || want_init_on_free()) { + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_or_poison_free_page(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); + } } /** - * snapshot_additional_pages - estimate the number of additional pages - * be needed for setting up the suspend image data structures for given - * zone (usually the returned value is greater than the exact number) + * snapshot_additional_pages - Estimate the number of extra pages needed. + * @zone: Memory zone to carry out the computation for. + * + * Estimate the number of additional pages needed for setting up a hibernation + * image data structures for @zone (usually, the returned value is greater than + * the exact number). */ - unsigned int snapshot_additional_pages(struct zone *zone) { - unsigned int res; + unsigned int rtree, nodes; - res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), - LINKED_PAGE_DATA_SIZE); - return 2 * res; + rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), + LINKED_PAGE_DATA_SIZE); + while (nodes > 1) { + nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); + rtree += nodes; + } + + return 2 * rtree; +} + +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. + */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); } #ifdef CONFIG_HIGHMEM /** - * count_free_highmem_pages - compute the total number of free highmem - * pages, system-wide. + * count_free_highmem_pages - Compute the total number of free highmem pages. + * + * The returned number is system-wide. */ - static unsigned int count_free_highmem_pages(void) { struct zone *zone; @@ -838,11 +1306,12 @@ static unsigned int count_free_highmem_pages(void) } /** - * saveable_highmem_page - Determine whether a highmem page should be - * included in the suspend image. + * saveable_highmem_page - Check if a highmem page is saveable. * - * We should save the page if it isn't Nosave or NosaveFree, or Reserved, - * and it isn't a part of a free chunk of pages. + * Determine whether a highmem page should be included in a hibernation image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't part of a free chunk of pages. */ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { @@ -851,14 +1320,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -868,10 +1339,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) } /** - * count_highmem_pages - compute the total number of saveable highmem - * pages. + * count_highmem_pages - Compute the total number of saveable highmem pages. */ - static unsigned int count_highmem_pages(void) { struct zone *zone; @@ -884,27 +1353,24 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; } return n; } -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} #endif /* CONFIG_HIGHMEM */ /** - * saveable_page - Determine whether a non-highmem page should be included - * in the suspend image. + * saveable_page - Check if the given page is saveable. + * + * Determine whether a non-highmem page should be included in a hibernation + * image. * - * We should save the page if it isn't Nosave, and is not in the range - * of pages statically defined as 'unsaveable', and it isn't a part of - * a free chunk of pages. + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't part of + * a free chunk of pages. */ static struct page *saveable_page(struct zone *zone, unsigned long pfn) { @@ -913,8 +1379,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -922,6 +1388,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; @@ -933,10 +1402,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) } /** - * count_data_pages - compute the total number of saveable non-highmem - * pages. + * count_data_pages - Compute the total number of saveable non-highmem pages. */ - static unsigned int count_data_pages(void) { struct zone *zone; @@ -948,7 +1415,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -956,115 +1423,146 @@ static unsigned int count_data_pages(void) return n; } -/* This is needed, because copy_page and memcpy are not usable for copying - * task structs. +/* + * This is needed, because copy_page and memcpy are not usable for copying + * task structs. Returns true if the page was filled with only zeros, + * otherwise false. */ -static inline void do_copy_page(long *dst, long *src) +static inline bool do_copy_page(long *dst, long *src) { + long z = 0; int n; - for (n = PAGE_SIZE / sizeof(long); n; n--) + for (n = PAGE_SIZE / sizeof(long); n; n--) { + z |= *src; *dst++ = *src++; + } + return !z; } - /** - * safe_copy_page - check if the page we are going to copy is marked as - * present in the kernel page tables (this always is the case if - * CONFIG_DEBUG_PAGEALLOC is not set and in that case - * kernel_page_present() always returns 'true'). + * safe_copy_page - Copy a page in a safe way. + * + * Check if the page we are going to copy is marked as present in the kernel + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. Returns true if the page was entirely composed of + * zeros, otherwise it will return false. */ -static void safe_copy_page(void *dst, struct page *s_page) +static bool safe_copy_page(void *dst, struct page *s_page) { + bool zeros_only; + if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); - do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_map_page(s_page); + zeros_only = do_copy_page(dst, page_address(s_page)); + hibernate_unmap_page(s_page); } + return zeros_only; } - #ifdef CONFIG_HIGHMEM -static inline struct page * -page_is_saveable(struct zone *zone, unsigned long pfn) +static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; + bool zeros_only; s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page); - dst = kmap_atomic(d_page); - do_copy_page(dst, src); - kunmap_atomic(dst); - kunmap_atomic(src); + src = kmap_local_page(s_page); + dst = kmap_local_page(d_page); + zeros_only = do_copy_page(dst, src); + kunmap_local(dst); + kunmap_local(src); } else { if (PageHighMem(d_page)) { - /* Page pointed to by src may contain some kernel + /* + * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page); + zeros_only = safe_copy_page(buffer, s_page); + dst = kmap_local_page(d_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); } else { - safe_copy_page(page_address(d_page), s_page); + zeros_only = safe_copy_page(page_address(d_page), s_page); } } + return zeros_only; } #else #define page_is_saveable(zone, pfn) saveable_page(zone, pfn) -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - safe_copy_page(page_address(pfn_to_page(dst_pfn)), + return safe_copy_page(page_address(pfn_to_page(dst_pfn)), pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -static void -copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +/* + * Copy data pages will copy all pages into pages pulled from the copy_bm. + * If a page was entirely filled with zeros it will be marked in the zero_bm. + * + * Returns the number of pages copied. + */ +static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm, + struct memory_bitmap *zero_bm) { + unsigned long copied_pages = 0; struct zone *zone; - unsigned long pfn; + unsigned long pfn, copy_pfn; for_each_populated_zone(zone) { unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); - for(;;) { + copy_pfn = memory_bm_next_pfn(copy_bm); + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + if (copy_data_page(copy_pfn, pfn)) { + memory_bm_set_bit(zero_bm, pfn); + /* Use this copy_pfn for a page that is not full of zeros */ + continue; + } + copied_pages++; + copy_pfn = memory_bm_next_pfn(copy_bm); } + return copied_pages; } /* Total number of image pages */ static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* Number of zero pages */ +static unsigned int nr_zero_pages; + /* * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) @@ -1080,38 +1578,59 @@ static struct memory_bitmap orig_bm; */ static struct memory_bitmap copy_bm; +/* Memory bitmap which tracks which saveable pages were zero filled. */ +static struct memory_bitmap zero_bm; + /** - * swsusp_free - free pages allocated for the suspend. + * swsusp_free - Free pages allocated for hibernation image. * - * Suspend pages are alocated before the atomic copy is made, so we - * need to release them after the resume. + * Image pages are allocated before snapshot creation, so they need to be + * released after resume. */ - void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + if (!forbidden_pages_map || !free_pages_map) + goto out; + + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + hibernate_restore_unprotect_page(page_address(page)); + __free_page(page); + goto loop; } + +out: nr_copy_pages = 0; nr_meta_pages = 0; + nr_zero_pages = 0; restore_pblist = NULL; buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernate_restore_protection_end(); } /* Helper functions used for the shrinking of memory. */ @@ -1119,7 +1638,7 @@ void swsusp_free(void) #define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) /** - * preallocate_image_pages - Allocate a number of pages for hibernation image + * preallocate_image_pages - Allocate a number of pages for hibernation image. * @nr_pages: Number of page frames to allocate. * @mask: GFP flags to use for the allocation. * @@ -1169,18 +1688,16 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) } /** - * __fraction - Compute (an approximation of) x * (multiplier / base) + * __fraction - Compute (an approximation of) x * (multiplier / base). */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { - x *= multiplier; - do_div(x, base); - return (unsigned long)x; + return div64_u64(x * multiplier, base); } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { unsigned long alloc = __fraction(nr_pages, highmem, total); @@ -1193,19 +1710,19 @@ static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) } static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, - unsigned long highmem, - unsigned long total) + unsigned long highmem, + unsigned long total) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * free_unnecessary_pages - Release preallocated pages not needed for the image + * free_unnecessary_pages - Release preallocated pages not needed for the image. */ -static void free_unnecessary_pages(void) +static unsigned long free_unnecessary_pages(void) { - unsigned long save, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem, free; save = count_data_pages(); if (alloc_normal >= save) { @@ -1226,6 +1743,7 @@ static void free_unnecessary_pages(void) else to_free_normal = 0; } + free = to_free_normal + to_free_highmem; memory_bm_position_reset(©_bm); @@ -1249,10 +1767,12 @@ static void free_unnecessary_pages(void) swsusp_unset_page_free(page); __free_page(page); } + + return free; } /** - * minimum_image_size - Estimate the minimum acceptable size of an image + * minimum_image_size - Estimate the minimum acceptable size of an image. * @saveable: Number of saveable pages in the system. * * We want to avoid attempting to free too much memory too hard, so estimate the @@ -1264,37 +1784,35 @@ static void free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, - * minus mapped file pages. + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages. */ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) - + global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_FILE) - - global_page_state(NR_FILE_MAPPED); + size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); return saveable <= size ? 0 : saveable - size; } /** - * hibernate_preallocate_memory - Preallocate memory for hibernation image + * hibernate_preallocate_memory - Preallocate memory for hibernation image. * * To create a hibernation image it is necessary to make a copy of every page * frame in use. We also need a number of page frames to be free during * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 - * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2 + * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. * @@ -1308,22 +1826,33 @@ int hibernate_preallocate_memory(void) struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; unsigned long alloc, save_highmem, pages_highmem, avail_normal; - struct timeval start, stop; + ktime_t start, stop; int error; - printk(KERN_INFO "PM: Preallocating image memory... "); - do_gettimeofday(&start); + pr_info("Preallocating image memory\n"); + start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate original bitmap\n"); goto err_out; + } error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate copy bitmap\n"); goto err_out; + } + + error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate zero bitmap\n"); + goto err_out; + } alloc_normal = 0; alloc_highmem = 0; + nr_zero_pages = 0; /* Count the number of saveable data pages. */ save_highmem = count_highmem_pages(); @@ -1348,9 +1877,6 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; - /* Add number of pages required for page keys (s390 only). */ - size += page_key_additional_pages(saveable); - /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -1399,15 +1925,22 @@ int hibernate_preallocate_memory(void) * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = (count - max_size) - pages_highmem; + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; pages = preallocate_image_memory(alloc, avail_normal); if (pages < alloc) { /* We have exhausted non-highmem pages, try highmem. */ alloc -= pages; pages += pages_highmem; pages_highmem = preallocate_image_highmem(alloc); - if (pages_highmem < alloc) + if (pages_highmem < alloc) { + pr_err("Image allocation is %lu pages short\n", + alloc - pages_highmem); goto err_out; + } pages += pages_highmem; /* * size is the desired number of saveable pages to leave in @@ -1434,27 +1967,27 @@ int hibernate_preallocate_memory(void) * pages in memory, but we have allocated more. Release the excessive * ones now. */ - free_unnecessary_pages(); + pages -= free_unnecessary_pages(); out: - do_gettimeofday(&stop); - printk(KERN_CONT "done (allocated %lu pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Allocated"); + stop = ktime_get(); + pr_info("Allocated %lu pages for snapshot\n", pages); + swsusp_show_speed(start, stop, pages, "Allocated"); return 0; err_out: - printk(KERN_CONT "\n"); swsusp_free(); return -ENOMEM; } #ifdef CONFIG_HIGHMEM /** - * count_pages_for_highmem - compute the number of non-highmem pages - * that will be necessary for creating copies of highmem pages. - */ - + * count_pages_for_highmem - Count non-highmem pages needed for copying highmem. + * + * Compute the number of non-highmem pages that will be necessary for creating + * copies of highmem pages. + */ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; @@ -1467,15 +2000,12 @@ static unsigned int count_pages_for_highmem(unsigned int nr_highmem) return nr_highmem; } #else -static unsigned int -count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * enough_free_mem - Make sure we have enough free memory for the - * snapshot image. + * enough_free_mem - Check if there is enough free memory for the image. */ - static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; @@ -1486,32 +2016,33 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) free += zone_page_state(zone, NR_FREE_PAGES); nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, free); + pr_debug("Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); return free > nr_pages + PAGES_FOR_IO; } #ifdef CONFIG_HIGHMEM /** - * get_highmem_buffer - if there are some highmem pages in the suspend - * image, we may need the buffer to copy them and/or load their data. + * get_highmem_buffer - Allocate a buffer for highmem pages. + * + * If there are some highmem pages in the hibernation image, we may need a + * buffer to copy them and/or load their data. */ - static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } /** - * alloc_highmem_image_pages - allocate some highmem pages for the image. - * Try to allocate as many pages as needed, but if the number of free - * highmem pages is lesser than that, allocate them all. + * alloc_highmem_pages - Allocate some highmem pages for the image. + * + * Try to allocate as many pages as needed, but if the number of free highmem + * pages is less than that, allocate them all. */ - -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1522,7 +2053,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; @@ -1530,25 +2061,23 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) #else static inline int get_highmem_buffer(int safe_needed) { return 0; } -static inline unsigned int -alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, + unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * swsusp_alloc - allocate memory for the suspend image + * swsusp_alloc - Allocate memory for hibernation image. * - * We first try to allocate as many highmem pages as there are - * saveable highmem pages in the system. If that fails, we allocate - * non-highmem pages for the copies of the remaining highmem ones. + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. * - * In this approach it is likely that the copies of highmem pages will - * also be located in the high memory, because of the way in which - * copy_data_pages() works. + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. */ - -static int -swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, - unsigned int nr_pages, unsigned int nr_highmem) +static int swsusp_alloc(struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { if (get_highmem_buffer(PG_ANY)) @@ -1563,7 +2092,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); @@ -1577,45 +2106,44 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, return -ENOMEM; } -asmlinkage int swsusp_save(void) +asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } - /* During allocating of suspend pagedir, new cold pages may appear. + /* + * During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(NULL); - copy_data_pages(©_bm, &orig_bm); + nr_copy_pages = copy_data_pages(©_bm, &orig_bm, &zero_bm); /* * End of critical section. From now on, we can write to memory, * but we should not touch disk. This specially means we must _not_ * touch swap space! Except we must write out our image of course. */ - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; + /* We don't actually copy the zero pages */ + nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", - nr_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } @@ -1628,17 +2156,17 @@ static int init_header_complete(struct swsusp_info *info) return 0; } -static char *check_image_kernel(struct swsusp_info *info) +static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -1660,13 +2188,22 @@ static int init_header(struct swsusp_info *info) return init_header_complete(info); } +#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1)) +#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG) + /** - * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm - * are stored in the array @buf[] (1 page at a time) + * pack_pfns - Prepare PFNs for saving. + * @bm: Memory bitmap. + * @buf: Memory buffer to store the PFNs in. + * @zero_bm: Memory bitmap containing PFNs of zero pages. + * + * PFNs corresponding to set bits in @bm are stored in the area of memory + * pointed to by @buf (1 page at a time). Pages which were filled with only + * zeros will have the highest bit set in the packed format to distinguish + * them from PFNs which will be contained in the image file. */ - -static inline void -pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { int j; @@ -1674,28 +2211,27 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Save page key for data page (s390 only). */ - page_key_read(buf + j); + if (memory_bm_test_bit(zero_bm, buf[j])) + buf[j] |= ENCODED_PFN_ZERO_FLAG; } } /** - * snapshot_read_next - used for reading the system memory snapshot. + * snapshot_read_next - Get the address to read the next image page from. + * @handle: Snapshot handle to be used for the reading. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the end of data stream condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the end of the data stream condition, + * and negative numbers are returned on errors. If that happens, the structure + * pointed to by @handle is not updated and should not be used any more. */ - int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) @@ -1718,21 +2254,22 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { clear_page(buffer); - pack_pfns(buffer, &orig_bm); + pack_pfns(buffer, &orig_bm, &zero_bm); } else { struct page *page; page = pfn_to_page(memory_bm_next_pfn(©_bm)); if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, + /* + * Highmem pages are copied to the buffer, * because we can't return with a kmapped * highmem page (we may not be called again). */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -1742,75 +2279,61 @@ int snapshot_read_next(struct snapshot_handle *handle) return PAGE_SIZE; } -/** - * mark_unsafe_pages - mark the pages that cannot be used for storing - * the image during resume, because they conflict with the pages that - * had been used before suspend - */ - -static int mark_unsafe_pages(struct memory_bitmap *bm) +static void duplicate_memory_bitmap(struct memory_bitmap *dst, + struct memory_bitmap *src) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long pfn; - /* Clear page flags */ - for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) - swsusp_unset_page_free(pfn_to_page(pfn)); + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); } - - /* Mark pages that correspond to the "original" pfns as "unsafe" */ - memory_bm_position_reset(bm); - do { - pfn = memory_bm_next_pfn(bm); - if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) - swsusp_set_page_free(pfn_to_page(pfn)); - else - return -EFAULT; - } - } while (pfn != BM_END_OF_MAP); - - allocated_unsafe_pages = 0; - - return 0; } -static void -duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) +/** + * mark_unsafe_pages - Mark pages that were used before hibernation. + * + * Mark the pages that cannot be used for storing the image during restoration, + * because they conflict with the pages that had been used before hibernation. + */ +static void mark_unsafe_pages(struct memory_bitmap *bm) { unsigned long pfn; - memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); + /* Clear the "free"/"unsafe" bit for all PFNs */ + memory_bm_position_reset(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); + memory_bm_clear_current(free_pages_map); + pfn = memory_bm_next_pfn(free_pages_map); } + + /* Mark pages that correspond to the "original" PFNs as "unsafe" */ + duplicate_memory_bitmap(free_pages_map, bm); + + allocated_unsafe_pages = 0; } static int check_header(struct swsusp_info *info) { - char *reason; + const char *reason; reason = check_image_kernel(info); if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { - printk(KERN_ERR "PM: Image mismatch: %s\n", reason); + pr_err("Image mismatch: %s\n", reason); return -EPERM; } return 0; } /** - * load header - check the image header and copy data from it + * load_header - Check the image header and copy the data from it. */ - -static int -load_header(struct swsusp_info *info) +static int load_header(struct swsusp_info *info) { int error; @@ -1824,36 +2347,48 @@ load_header(struct swsusp_info *info) } /** - * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set - * the corresponding bit in the memory bitmap @bm + * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. + * @bm: Memory bitmap. + * @buf: Area of memory containing the PFNs. + * @zero_bm: Memory bitmap with the zero PFNs marked. + * + * For each element of the array pointed to by @buf (1 page at a time), set the + * corresponding bit in @bm. If the page was originally populated with only + * zeros then a corresponding bit will also be set in @zero_bm. */ -static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { + unsigned long decoded_pfn; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { if (unlikely(buf[j] == BM_END_OF_MAP)) break; - /* Extract and buffer page key for data page (s390 only). */ - page_key_memorize(buf + j); - - if (memory_bm_pfn_present(bm, buf[j])) - memory_bm_set_bit(bm, buf[j]); - else + zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG); + decoded_pfn = buf[j] & ENCODED_PFN_MASK; + if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) { + memory_bm_set_bit(bm, decoded_pfn); + if (zero) { + memory_bm_set_bit(zero_bm, decoded_pfn); + nr_zero_pages++; + } + } else { + if (!pfn_valid(decoded_pfn)) + pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", + (unsigned long long)PFN_PHYS(decoded_pfn)); return -EFAULT; + } } return 0; } -/* List of "safe" pages that may be used to store data loaded from the suspend - * image - */ -static struct linked_page *safe_pages_list; - #ifdef CONFIG_HIGHMEM -/* struct highmem_pbe is used for creating the list of highmem pages that +/* + * struct highmem_pbe is used for creating the list of highmem pages that * should be restored atomically during the resume from disk, because the page * frames they have occupied before the suspend are in use. */ @@ -1863,7 +2398,8 @@ struct highmem_pbe { struct highmem_pbe *next; }; -/* List of highmem PBEs needed for restoring the highmem pages that were +/* + * List of highmem PBEs needed for restoring the highmem pages that were * allocated before the suspend and included in the suspend image, but have * also been allocated by the "resume" kernel, so their contents cannot be * written directly to their "original" page frames. @@ -1871,11 +2407,11 @@ struct highmem_pbe { static struct highmem_pbe *highmem_pblist; /** - * count_highmem_image_pages - compute the number of highmem pages in the - * suspend image. The bits in the memory bitmap @bm that correspond to the - * image pages are assumed to be set. + * count_highmem_image_pages - Compute the number of highmem pages in the image. + * @bm: Memory bitmap. + * + * The bits in @bm that correspond to image pages are assumed to be set. */ - static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { unsigned long pfn; @@ -1892,24 +2428,25 @@ static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) return cnt; } -/** - * prepare_highmem_image - try to allocate as many highmem pages as - * there are highmem image pages (@nr_highmem_p points to the variable - * containing the number of highmem image pages). The pages that are - * "safe" (ie. will not be overwritten when the suspend image is - * restored) have the corresponding bits set in @bm (it must be - * unitialized). - * - * NOTE: This function should not be called if there are no highmem - * image pages. - */ - static unsigned int safe_highmem_pages; static struct memory_bitmap *safe_highmem_bm; -static int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +/** + * prepare_highmem_image - Allocate memory for loading highmem data from image. + * @bm: Pointer to an uninitialized memory bitmap structure. + * @nr_highmem_p: Pointer to the number of highmem image pages. + * + * Try to allocate as many highmem pages as there are highmem image pages + * (@nr_highmem_p points to the variable containing the number of highmem image + * pages). The pages that are "safe" (ie. will not be overwritten when the + * hibernation image is restored entirely) have the corresponding bits set in + * @bm (it must be uninitialized). + * + * NOTE: This function should not be called if there are no highmem image pages. + */ +static int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { unsigned int to_alloc; @@ -1944,39 +2481,42 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) return 0; } +static struct page *last_highmem_page; + /** - * get_highmem_page_buffer - for given highmem image page find the buffer - * that suspend_write_next() should set for its caller to write to. + * get_highmem_page_buffer - Prepare a buffer to store a highmem image page. * - * If the page is to be saved to its "original" page frame or a copy of - * the page is to be made in the highmem, @buffer is returned. Otherwise, - * the copy of the page is to be made in normal memory, so the address of - * the copy is returned. + * For a given highmem image page get a buffer that suspend_write_next() should + * return to its caller to write to. * - * If @buffer is returned, the caller of suspend_write_next() will write - * the page's contents to @buffer, so they will have to be copied to the - * right location on the next call to suspend_write_next() and it is done - * with the help of copy_last_highmem_page(). For this purpose, if - * @buffer is returned, @last_highmem page is set to the page to which - * the data will have to be copied from @buffer. + * If the page is to be saved to its "original" page frame or a copy of + * the page is to be made in the highmem, @buffer is returned. Otherwise, + * the copy of the page is to be made in normal memory, so the address of + * the copy is returned. + * + * If @buffer is returned, the caller of suspend_write_next() will write + * the page's contents to @buffer, so they will have to be copied to the + * right location on the next call to suspend_write_next() and it is done + * with the help of copy_last_highmem_page(). For this purpose, if + * @buffer is returned, @last_highmem_page is set to the page to which + * the data will have to be copied from @buffer. */ - -static struct page *last_highmem_page; - -static void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { struct highmem_pbe *pbe; void *kaddr; if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ last_highmem_page = page; return buffer; } - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); @@ -1996,8 +2536,9 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe->copy_page = tmp; } else { /* Copy of the page will be stored in normal memory */ - kaddr = safe_pages_list; - safe_pages_list = safe_pages_list->next; + kaddr = __get_safe_page(ca->gfp_mask); + if (!kaddr) + return ERR_PTR(-ENOMEM); pbe->copy_page = virt_to_page(kaddr); } pbe->next = highmem_pblist; @@ -2006,19 +2547,20 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) } /** - * copy_last_highmem_page - copy the contents of a highmem image from - * @buffer, where the caller of snapshot_write_next() has place them, - * to the right location represented by @last_highmem_page . + * copy_last_highmem_page - Copy most the most recent highmem image page. + * + * Copy the contents of a highmem image from @buffer, where the caller of + * snapshot_write_next() has stored them, to the right location represented by + * @last_highmem_page . */ - static void copy_last_highmem_page(void) { if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2037,19 +2579,13 @@ static inline void free_highmem_data(void) free_image_page(buffer, PG_UNSAFE_CLEAR); } #else -static inline int get_safe_write_buffer(void) { return 0; } - -static unsigned int -count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } -static inline int -prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) -{ - return 0; -} +static inline int prepare_highmem_image(struct memory_bitmap *bm, + unsigned int *nr_highmem_p) { return 0; } -static inline void * -get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) +static inline void *get_highmem_page_buffer(struct page *page, + struct chain_allocator *ca) { return ERR_PTR(-EINVAL); } @@ -2059,27 +2595,33 @@ static inline int last_highmem_page_copied(void) { return 1; } static inline void free_highmem_data(void) {} #endif /* CONFIG_HIGHMEM */ +#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) + /** - * prepare_image - use the memory bitmap @bm to mark the pages that will - * be overwritten in the process of restoring the system memory state - * from the suspend image ("unsafe" pages) and allocate memory for the - * image. + * prepare_image - Make room for loading hibernation image. + * @new_bm: Uninitialized memory bitmap structure. + * @bm: Memory bitmap with unsafe pages marked. + * @zero_bm: Memory bitmap containing the zero pages. + * + * Use @bm to mark the pages that will be overwritten in the process of + * restoring the system memory state from the suspend image ("unsafe" pages) + * and allocate memory for the image. + * + * The idea is to allocate a new memory bitmap first and then allocate + * as many pages as needed for image data, but without specifying what those + * pages will be used for just yet. Instead, we mark them all as allocated and + * create a lists of "safe" pages to be used later. On systems with high + * memory a list of "safe" highmem pages is created too. * - * The idea is to allocate a new memory bitmap first and then allocate - * as many pages as needed for the image data, but not to assign these - * pages to specific tasks initially. Instead, we just mark them as - * allocated and create a lists of "safe" pages that will be used - * later. On systems with high memory a list of "safe" highmem pages is - * also created. + * Because it was not known which pages were unsafe when @zero_bm was created, + * make a copy of it and recreate it within safe pages. */ - -#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) - -static int -prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { unsigned int nr_pages, nr_highmem; - struct linked_page *sp_list, *lp; + struct memory_bitmap tmp; + struct linked_page *lp; int error; /* If there is no highmem, the buffer will not be necessary */ @@ -2087,9 +2629,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) buffer = NULL; nr_highmem = count_highmem_image_pages(bm); - error = mark_unsafe_pages(bm); - if (error) - goto Free; + mark_unsafe_pages(bm); error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); if (error) @@ -2097,20 +2637,39 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + + /* Make a copy of zero_bm so it can be created in safe pages */ + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(&tmp, zero_bm); + memory_bm_free(zero_bm, PG_UNSAFE_KEEP); + + /* Recreate zero_bm in safe pages */ + error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(zero_bm, &tmp); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); + /* At this point zero_bm is in safe pages and it can be used for restoring. */ + if (nr_highmem > 0) { error = prepare_highmem_image(bm, &nr_highmem); if (error) goto Free; } - /* Reserve some safe pages for potential later use. + /* + * Reserve some safe pages for potential later use. * * NOTE: This way we make sure there will be enough safe pages for the * chain_alloc() in get_buffer(). It is a bit wasteful, but * nr_copy_pages cannot be greater than 50% of the memory anyway. + * + * nr_copy_pages cannot be less than allocated_unsafe_pages too. */ - sp_list = NULL; - /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { lp = get_image_page(GFP_ATOMIC, PG_SAFE); @@ -2118,13 +2677,12 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - lp->next = sp_list; - sp_list = lp; + lp->next = safe_pages_list; + safe_pages_list = lp; nr_pages--; } /* Preallocate memory for the image */ - safe_pages_list = NULL; - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -2141,12 +2699,6 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } - /* Free the reserved safe pages so that chain_alloc() can use them */ - while (sp_list) { - lp = sp_list->next; - free_image_page(sp_list, PG_UNSAFE_CLEAR); - sp_list = lp; - } return 0; Free: @@ -2155,10 +2707,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) } /** - * get_buffer - compute the address that snapshot_write_next() should - * set for its caller to write to. + * get_buffer - Get the address to store the next image data page. + * + * Get the address that snapshot_write_next() should return to its caller to + * write to. */ - static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; @@ -2173,12 +2726,14 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return get_highmem_page_buffer(page, ca); if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) - /* We have allocated the "original" page frame and we can + /* + * We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ return page_address(page); - /* The "original" page frame has not been allocated and we have to + /* + * The "original" page frame has not been allocated and we have to * use a "safe" page frame to store the loaded page. */ pbe = chain_alloc(ca, sizeof(struct pbe)); @@ -2187,41 +2742,40 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); - pbe->address = safe_pages_list; - safe_pages_list = safe_pages_list->next; + pbe->address = __get_safe_page(ca->gfp_mask); + if (!pbe->address) + return ERR_PTR(-ENOMEM); pbe->next = restore_pblist; restore_pblist = pbe; return pbe->address; } /** - * snapshot_write_next - used for writing the system memory snapshot. + * snapshot_write_next - Get the address to store the next image page. + * @handle: Snapshot handle structure to guide the writing. * - * On the first call to it @handle should point to a zeroed - * snapshot_handle structure. The structure gets updated and a pointer - * to it should be passed to this function every next time. + * On the first call, @handle should point to a zeroed snapshot_handle + * structure. The structure gets populated then and a pointer to it should be + * passed to this function every next time. * - * On success the function returns a positive number. Then, the caller - * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. + * On success, the function returns a positive number. Then, the caller + * is allowed to write up to the returned number of bytes to the memory + * location computed by the data_of() macro. * - * The function returns 0 to indicate the "end of file" condition, - * and a negative number is returned on error. In such cases the - * structure pointed to by @handle is not updated and should not be used - * any more. + * The function returns 0 to indicate the "end of file" condition. Negative + * numbers are returned on errors, in which cases the structure pointed to by + * @handle is not updated and should not be used any more. */ - int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; +next: /* Check if we have already loaded the entire image */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) return 0; - handle->sync_read = 1; - if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ @@ -2236,100 +2790,113 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + safe_pages_list = NULL; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); if (error) return error; - /* Allocate buffer for page keys. */ - error = page_key_alloc(nr_copy_pages); + error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY); if (error) return error; + nr_zero_pages = 0; + + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { - error = unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm, &zero_bm); if (error) return error; if (handle->cur == nr_meta_pages + 1) { - error = prepare_image(&orig_bm, ©_bm); + error = prepare_image(&orig_bm, ©_bm, &zero_bm); if (error) return error; chain_init(&ca, GFP_ATOMIC, PG_SAFE); memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(&zero_bm); restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); + error = hibernate_restore_protect_page(handle->buffer); + if (error) + return error; handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } + handle->sync_read = (handle->buffer == buffer); handle->cur++; + + /* Zero pages were not included in the image, memset it and move on. */ + if (handle->cur > nr_meta_pages + 1 && + memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) { + memset(handle->buffer, 0, PAGE_SIZE); + goto next; + } + return PAGE_SIZE; } /** - * snapshot_write_finalize - must be called after the last call to - * snapshot_write_next() in case the last page in the image happens - * to be a highmem page and its contents should be stored in the - * highmem. Additionally, it releases the memory that will not be - * used any more. + * snapshot_write_finalize - Complete the loading of a hibernation image. + * + * Must be called after the last call to snapshot_write_next() in case the last + * page in the image happens to be a highmem page and its contents should be + * stored in highmem. Additionally, it recycles bitmap memory that's not + * necessary any more. */ - -void snapshot_write_finalize(struct snapshot_handle *handle) +int snapshot_write_finalize(struct snapshot_handle *handle) { + int error; + copy_last_highmem_page(); - /* Restore page key for data page (s390 only). */ - page_key_write(handle->buffer); - page_key_free(); - /* Free only if we have loaded the image entirely */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { - memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); + error = hibernate_restore_protect_page(handle->buffer); + /* Do that only if we have loaded the image entirely */ + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { + memory_bm_recycle(&orig_bm); free_highmem_data(); } + return error; } int snapshot_image_loaded(struct snapshot_handle *handle) { return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); + handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages); } #ifdef CONFIG_HIGHMEM /* Assumes that @buf is ready and points to a "safe" page */ -static inline void -swap_two_pages_data(struct page *p1, struct page *p2, void *buf) +static inline void swap_two_pages_data(struct page *p1, struct page *p2, + void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** - * restore_highmem - for each highmem page that was allocated before - * the suspend and included in the suspend image, and also has been - * allocated by the "resume" kernel swap its current (ie. "before - * resume") contents with the previous (ie. "before suspend") one. + * restore_highmem - Put highmem image pages into their original locations. + * + * For each highmem page that was in use before hibernation and is included in + * the image, and also has been allocated by the "restore" kernel, swap its + * current contents with the previous (ie. "before hibernation") ones. * - * If the resume eventually fails, we can call this function once - * again and restore the "before resume" highmem state. + * If the restore eventually fails, we can call this function once again and + * restore the highmem state as seen by the restore kernel. */ - int restore_highmem(void) { struct highmem_pbe *pbe = highmem_pblist; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -1,20 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend.c - Suspend to RAM and standby functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. - * - * This file is released under the GPLv2. */ +#define pr_fmt(fmt) "PM: " fmt + #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> -#include <linux/syscalls.h> +#include <linux/cpuidle.h> #include <linux/gfp.h> #include <linux/io.h> #include <linux/kernel.h> @@ -24,43 +25,193 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/swait.h> #include <linux/ftrace.h> #include <trace/events/power.h> +#include <linux/compiler.h> +#include <linux/moduleparam.h> +#include <linux/fs.h> #include "power.h" -const char *const pm_states[PM_SUSPEND_MAX] = { - [PM_SUSPEND_FREEZE] = "freeze", - [PM_SUSPEND_STANDBY] = "standby", - [PM_SUSPEND_MEM] = "mem", +const char * const pm_labels[] = { + [PM_SUSPEND_TO_IDLE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; +const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_TO_IDLE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", }; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; +suspend_state_t pm_suspend_target_state; +EXPORT_SYMBOL_GPL(pm_suspend_target_state); + +unsigned int pm_suspend_global_flags; +EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; +static const struct platform_s2idle_ops *s2idle_ops; +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); + +enum s2idle_states __read_mostly s2idle_state; +static DEFINE_RAW_SPINLOCK(s2idle_lock); + +/** + * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ +bool pm_suspend_default_s2idle(void) +{ + return mem_sleep_current == PM_SUSPEND_TO_IDLE; +} +EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); + +void s2idle_set_ops(const struct platform_s2idle_ops *ops) +{ + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + s2idle_ops = ops; + unlock_system_sleep(sleep_flags); +} + +static void s2idle_begin(void) +{ + s2idle_state = S2IDLE_STATE_NONE; +} -static bool need_suspend_ops(suspend_state_t state) +static void s2idle_enter(void) { - return !!(state > PM_SUSPEND_FREEZE); + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ + raw_spin_lock_irq(&s2idle_lock); + if (pm_wakeup_pending()) + goto out; + + s2idle_state = S2IDLE_STATE_ENTER; + raw_spin_unlock_irq(&s2idle_lock); + + /* Push all the CPUs into the idle loop. */ + wake_up_all_idle_cpus(); + /* Make the current CPU wait so it can enter the idle loop too. */ + swait_event_exclusive(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); + + /* + * Kick all CPUs to ensure that they resume their timers and restore + * consistent system state. + */ + wake_up_all_idle_cpus(); + + raw_spin_lock_irq(&s2idle_lock); + + out: + s2idle_state = S2IDLE_STATE_NONE; + raw_spin_unlock_irq(&s2idle_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); +} + +static void s2idle_loop(void) +{ + pm_pr_dbg("suspend-to-idle\n"); + + /* + * Suspend-to-idle equals: + * frozen processes + suspended devices + idle processors. + * Thus s2idle_enter() should be called right after all devices have + * been suspended. + * + * Wakeups during the noirq suspend of devices may be spurious, so try + * to avoid them upfront. + */ + for (;;) { + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { + break; + } + + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); + + s2idle_enter(); + } + + pm_pr_dbg("resume from suspend-to-idle\n"); } -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -static bool suspend_freeze_wake; +void s2idle_wake(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&s2idle_lock, flags); + if (s2idle_state > S2IDLE_STATE_NONE) { + s2idle_state = S2IDLE_STATE_WAKE; + swake_up_one(&s2idle_wait_head); + } + raw_spin_unlock_irqrestore(&s2idle_lock, flags); +} +EXPORT_SYMBOL_GPL(s2idle_wake); -static void freeze_begin(void) +static bool valid_state(suspend_state_t state) { - suspend_freeze_wake = false; + /* + * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level + * support and need to be valid to the low-level implementation. + * + * No ->valid() or ->enter() callback implies that none are valid. + */ + return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) && + suspend_ops->enter; } -static void freeze_enter(void) +void __init pm_states_init(void) { - wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; + /* + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. + */ + mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE]; } -void freeze_wake(void) +static int __init mem_sleep_default_setup(char *str) { - suspend_freeze_wake = true; - wake_up(&suspend_freeze_wait_head); + suspend_state_t state; + + for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + mem_sleep_current = state; + break; + } + + return 1; } -EXPORT_SYMBOL_GPL(freeze_wake); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -68,38 +219,31 @@ EXPORT_SYMBOL_GPL(freeze_wake); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); + suspend_ops = ops; - unlock_system_sleep(); -} -EXPORT_SYMBOL_GPL(suspend_set_ops); -bool valid_state(suspend_state_t state) -{ - if (state == PM_SUSPEND_FREEZE) { -#ifdef CONFIG_PM_DEBUG - if (pm_test_level != TEST_NONE && - pm_test_level != TEST_FREEZER && - pm_test_level != TEST_DEVICES && - pm_test_level != TEST_PLATFORM) { - printk(KERN_WARNING "Unsupported pm_test mode for " - "freeze state, please choose " - "none/freezer/devices/platform.\n"); - return false; - } -#endif - return true; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; } - /* - * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel - * support and need to be valid to the lowlevel - * implementation, no valid callback implies that none are valid. - */ - return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default >= PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } + + unlock_system_sleep(sleep_flags); } +EXPORT_SYMBOL_GPL(suspend_set_ops); /** * suspend_valid_only_mem - Generic memory-only valid callback. + * @state: Target system sleep state. * * Platform drivers that implement mem suspend only and only need to check for * that in their .valid() callback can use this instead of rolling their own @@ -111,12 +255,103 @@ int suspend_valid_only_mem(suspend_state_t state) } EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ? + suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ? + s2idle_ops->prepare() : 0; +} + +static int platform_suspend_prepare_noirq(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE) + return s2idle_ops && s2idle_ops->prepare_late ? + s2idle_ops->prepare_late() : 0; + + return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0; +} + +static void platform_resume_noirq(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE) { + if (s2idle_ops && s2idle_ops->restore_early) + s2idle_ops->restore_early(); + } else if (suspend_ops->wake) { + suspend_ops->wake(); + } +} + +static void platform_resume_early(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore) + s2idle_ops->restore(); +} + +static void platform_resume_finish(suspend_state_t state) +{ + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish) + suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin) + return s2idle_ops->begin(); + else if (suspend_ops && suspend_ops->begin) + return suspend_ops->begin(state); + else + return 0; +} + +static void platform_resume_end(suspend_state_t state) +{ + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end) + s2idle_ops->end(); + else if (suspend_ops && suspend_ops->end) + suspend_ops->end(); +} + +static void platform_recover(suspend_state_t state) +{ + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover) + suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ + return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ? + suspend_ops->suspend_again() : false; +} + +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -125,6 +360,7 @@ static int suspend_test(int level) /** * suspend_prepare - Prepare for entering system sleep state. + * @state: Target system sleep state. * * Common code run for every system sleep state that can be entered (except for * hibernation). Run suspend notifiers, allocate the "suspend" console and @@ -134,35 +370,38 @@ static int suspend_prepare(suspend_state_t state) { int error; - if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) + if (!sleep_state_supported(state)) return -EPERM; pm_prepare_console(); - error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + error = pm_notifier_call_chain_robust(PM_SUSPEND_PREPARE, PM_POST_SUSPEND); if (error) - goto Finish; + goto Restore; + filesystems_freeze(filesystem_freeze_enabled); + trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); + trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; - suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - Finish: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); + Restore: pm_restore_console(); return error; } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } @@ -178,71 +417,80 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (need_suspend_ops(state) && suspend_ops->prepare) { - error = suspend_ops->prepare(); - if (error) - goto Platform_finish; - } + error = platform_suspend_prepare(state); + if (error) + goto Platform_finish; - error = dpm_suspend_end(PMSG_SUSPEND); + error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); + pr_err("late suspend of devices failed\n"); goto Platform_finish; } + error = platform_suspend_prepare_late(state); + if (error) + goto Devices_early_resume; - if (need_suspend_ops(state) && suspend_ops->prepare_late) { - error = suspend_ops->prepare_late(); - if (error) - goto Platform_wake; + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + pr_err("noirq suspend of devices failed\n"); + goto Platform_early_resume; } + error = platform_suspend_prepare_noirq(state); + if (error) + goto Platform_wake; if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - /* - * PM_SUSPEND_FREEZE equals - * frozen processes + suspended devices + idle processors. - * Thus we should invoke freeze_enter() soon after - * all the devices are suspended. - */ - if (state == PM_SUSPEND_FREEZE) { - freeze_enter(); + if (state == PM_SUSPEND_TO_IDLE) { + s2idle_loop(); goto Platform_wake; } - error = disable_nonboot_cpus(); + error = pm_sleep_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); if (!(suspend_test(TEST_CORE) || *wakeup)) { + trace_suspend_resume(TPS("machine_suspend"), + state, true); error = suspend_ops->enter(state); - events_check_enabled = false; + trace_suspend_resume(TPS("machine_suspend"), + state, false); + } else if (*wakeup) { + error = -EBUSY; } syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); Enable_cpus: - enable_nonboot_cpus(); + pm_sleep_enable_secondary_cpus(); Platform_wake: - if (need_suspend_ops(state) && suspend_ops->wake) - suspend_ops->wake(); + platform_resume_noirq(state); + dpm_resume_noirq(PMSG_RESUME); - dpm_resume_start(PMSG_RESUME); + Platform_early_resume: + platform_resume_early(state); - Platform_finish: - if (need_suspend_ops(state) && suspend_ops->finish) - suspend_ops->finish(); + Devices_early_resume: + dpm_resume_early(PMSG_RESUME); + Platform_finish: + platform_resume_finish(state); return error; } @@ -255,21 +503,23 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (need_suspend_ops(state) && !suspend_ops) + if (!sleep_state_supported(state)) return -ENOSYS; - trace_machine_suspend(state); - if (need_suspend_ops(state) && suspend_ops->begin) { - error = suspend_ops->begin(state); - if (error) - goto Close; - } - suspend_console(); - ftrace_stop(); + pm_suspend_target_state = state; + + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + + error = platform_suspend_begin(state); + if (error) + goto Close; + + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { - pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + pr_err("Some devices failed to suspend, or early wake event detected\n"); goto Recover_platform; } suspend_test_finish("suspend devices"); @@ -278,24 +528,23 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup && need_suspend_ops(state) - && suspend_ops->suspend_again && suspend_ops->suspend_again()); + } while (!error && !wakeup && platform_suspend_again(state)); Resume_devices: suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - ftrace_start(); - resume_console(); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); + Close: - if (need_suspend_ops(state) && suspend_ops->end) - suspend_ops->end(); - trace_machine_suspend(PWR_EVENT_EXIT); + platform_resume_end(state); + pm_suspend_target_state = PM_SUSPEND_ON; return error; Recover_platform: - if (need_suspend_ops(state) && suspend_ops->recover) - suspend_ops->recover(); + platform_recover(state); goto Resume_devices; } @@ -308,6 +557,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } @@ -324,20 +574,35 @@ static int enter_state(suspend_state_t state) { int error; - if (!valid_state(state)) - return -ENODEV; - - if (!mutex_trylock(&pm_mutex)) + trace_suspend_resume(TPS("suspend_enter"), state, true); + if (state == PM_SUSPEND_TO_IDLE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { + pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); + return -EAGAIN; + } +#endif + } else if (!valid_state(state)) { + return -EINVAL; + } + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; - if (state == PM_SUSPEND_FREEZE) - freeze_begin(); + if (state == PM_SUSPEND_TO_IDLE) + s2idle_begin(); - printk(KERN_INFO "PM: Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); + if (sync_on_suspend_enabled) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } + + pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); + pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) goto Unlock; @@ -345,16 +610,16 @@ static int enter_state(suspend_state_t state) if (suspend_test(TEST_FREEZER)) goto Finish; - pr_debug("PM: Entering %s sleep\n", pm_states[state]); - pm_restrict_gfp_mask(); + trace_suspend_resume(TPS("suspend_enter"), state, false); + pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: - pr_debug("PM: Finishing wakeup.\n"); + events_check_enabled = false; + pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - mutex_unlock(&pm_mutex); + mutex_unlock(&system_transition_mutex); return error; } @@ -372,13 +637,10 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } + dpm_save_errno(error); + pr_info("suspend exit\n"); return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..d4856ec61570 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. * * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> - * - * This file is released under the GPLv2. */ #include <linux/init.h> @@ -22,6 +21,8 @@ #define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; +static u32 test_repeat_count_max = 1; +static u32 test_repeat_count_current; void suspend_test_start(void) { @@ -69,20 +70,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) static char info_test[] __initdata = KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - unsigned long now; + time64_t now; struct rtc_wkalrm alm; int status; /* this may fail if the RTC hasn't been initialized */ +repeat: status = rtc_read_time(rtc, &alm.time); if (status < 0) { printk(err_readtime, dev_name(&rtc->dev), status); return; } - rtc_tm_to_time(&alm.time, &now); + now = rtc_tm_to_time64(&alm.time); memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); alm.enabled = true; status = rtc_set_alarm(rtc, &alm); @@ -100,10 +102,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) if (state == PM_SUSPEND_STANDBY) { printk(info_test, pm_states[state]); status = pm_suspend(state); + if (status < 0) + state = PM_SUSPEND_TO_IDLE; } + if (state == PM_SUSPEND_TO_IDLE) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) printk(err_suspend, status); + test_repeat_count_current++; + if (test_repeat_count_current < test_repeat_count_max) + goto repeat; + /* Some platforms can't detect that the alarm triggered the * wakeup, or (accordingly) disable it after it afterwards. * It's supposed to give oneshot behavior; cope. @@ -116,7 +129,7 @@ static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); - if (!candidate->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, candidate->features)) return 0; if (!device_may_wakeup(candidate->dev.parent)) return 0; @@ -129,27 +142,37 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - unsigned i; + int i; + char *repeat; + char *suspend_type; - /* "=mem" ==> "mem" */ + /* example : "=mem[,N]" ==> "mem[,N]" */ value++; - for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (!pm_states[i]) - continue; - if (strcmp(pm_states[i], value) != 0) - continue; - test_state = (__force suspend_state_t) i; - return 0; + suspend_type = strsep(&value, ","); + if (!suspend_type) + return 1; + + repeat = strsep(&value, ","); + if (repeat) { + if (kstrtou32(repeat, 0, &test_repeat_count_max)) + return 1; } - printk(warn_bad_state, value); - return 0; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (!strcmp(pm_labels[i], suspend_type)) { + test_state_label = pm_labels[i]; + return 1; + } + + printk(warn_bad_state, suspend_type); + return 1; } __setup("test_suspend", setup_test_suspend); @@ -160,28 +183,37 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!valid_state(test_state)) { - printk(warn_bad_state, pm_states[test_state]); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? */ - dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm); + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed200410..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * @@ -7,16 +8,15 @@ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> - * - * This file is released under the GPLv2. - * */ +#define pr_fmt(fmt) "PM: " fmt + +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> -#include <linux/genhd.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -24,31 +24,40 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> -#include <linux/lzo.h> #include <linux/vmalloc.h> #include <linux/cpumask.h> #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> +#include <linux/ktime.h> #include "power.h" #define HIBERNATE_SIG "S1SUSPEND" +u32 swsusp_hardware_signature; + +/* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -78,11 +87,9 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way +/* + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -95,21 +102,21 @@ struct swap_map_handle { struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - - sizeof(u32)]; + sizeof(u32) - sizeof(u32)]; + u32 hw_sig; u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. +/* + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -159,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -178,24 +184,21 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; - unsigned long offset; - ext = container_of(node, struct swsusp_extent, node); + ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_nr(swp_entry(swap, ext->start), + ext->end - ext->start + 1); kfree(ext); } @@ -211,7 +214,84 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -struct block_device *hib_resume_bdev; +static struct file *hib_resume_bdev_file; + +struct hib_bio_batch { + atomic_t count; + wait_queue_head_t wait; + blk_status_t error; + struct blk_plug plug; +}; + +static void hib_init_batch(struct hib_bio_batch *hb) +{ + atomic_set(&hb->count, 0); + init_waitqueue_head(&hb->wait); + hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); +} + +static void hib_end_io(struct bio *bio) +{ + struct hib_bio_batch *hb = bio->bi_private; + struct page *page = bio_first_page_all(bio); + + if (bio->bi_status) { + pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); + } + + if (bio_data_dir(bio) == WRITE) + put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); + + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; + if (atomic_dec_and_test(&hb->count)) + wake_up(&hb->wait); + + bio_put(bio); +} + +static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr) +{ + return bdev_rw_virt(file_bdev(hib_resume_bdev_file), + page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf); +} + +static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) +{ + struct bio *bio; + + bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, + GFP_NOIO | __GFP_HIGH); + bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); + bio_add_virt_nofail(bio, addr, PAGE_SIZE); + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + return 0; +} + +static int hib_wait_io(struct hib_bio_batch *hb) +{ + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. + */ + wait_event(hb->wait, atomic_read(&hb->count) == 0); + return blk_status_to_errno(hb->error); +} /* * Saving part @@ -221,90 +301,85 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; + if (swsusp_hardware_signature) { + swsusp_header->hw_sig = swsusp_hardware_signature; + flags |= SF_HW_SIG; + } swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_bio_write_page(swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header); } else { - printk(KERN_ERR "PM: Swap header not found!\n"); + pr_err("Swap header not found!\n"); error = -ENODEV; } return error; } -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image +/* + * Hold the swsusp_header flag. This is used in software_resume() in + * 'kernel/power/hibernate' to check if the image is compressed and query + * for the compression algorithm support(if so). */ +unsigned int swsusp_header_flags; + static int swsusp_swap_check(void) { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; - root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; - res = set_blocksize(hib_resume_bdev, PAGE_SIZE); - if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); + if (IS_ERR(hib_resume_bdev_file)) + return PTR_ERR(hib_resume_bdev_file); - return res; + return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @bio_chain: Link the next write BIO here - */ - -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; void *src; int ret; if (!offset) return -ENOSPC; - if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(__GFP_WAIT | - __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; + if (!hb) + goto sync_io; + + src = (void *)__get_free_page(gfp); + if (!src) { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(gfp); + if (WARN_ON_ONCE(!src)) + goto sync_io; } - return hib_bio_write_page(offset, src, bio_chain); + + copy_page(src, buf); + return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +sync_io: + return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf); } static void release_swap_writer(struct swap_map_handle *handle) @@ -321,8 +396,7 @@ static int get_swap_writer(struct swap_map_handle *handle) ret = swsusp_swap_check(); if (ret) { if (ret != -ENOSPC) - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + pr_err("Cannot find swap device, try swapon -a\n"); return ret; } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -342,20 +416,20 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(FMODE_WRITE); + swsusp_close(); return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); + error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; @@ -364,15 +438,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, bio_chain); + error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); + if (hb && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_io(hb); if (error) goto out; /* @@ -398,43 +472,45 @@ static int swap_writer_finish(struct swap_map_handle *handle, unsigned int flags, int error) { if (!error) { - flush_swap_writer(handle); - printk(KERN_INFO "PM: S"); + pr_info("S"); error = mark_swapfiles(handle, flags); - printk("|\n"); + pr_cont("|\n"); + flush_swap_writer(handle); } if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(FMODE_WRITE); + swsusp_close(); return error; } +/* + * Bytes we need for compressed data in worst case. We assume(limitation) + * this is the worst of all the compression algorithms. + */ +#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2) + /* We need to remember how much compressed data we need to read. */ -#define LZO_HEADER sizeof(size_t) +#define CMP_HEADER sizeof(size_t) /* Number of pages/bytes we'll compress at one time. */ -#define LZO_UNC_PAGES 32 -#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) +#define UNC_PAGES 32 +#define UNC_SIZE (UNC_PAGES * PAGE_SIZE) -/* Number of pages/bytes we need for compressed data (worst case). */ -#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ - LZO_HEADER, PAGE_SIZE) -#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Number of pages we need for compressed data (worst case). */ +#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \ + CMP_HEADER, PAGE_SIZE) +#define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define LZO_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ -#define LZO_MIN_RD_PAGES 1024 -#define LZO_MAX_RD_PAGES 8192 - - -/** - * save_image - save the suspend image data - */ +#define CMP_MIN_RD_PAGES 1024 +#define CMP_MAX_RD_PAGES 8192 static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, @@ -444,41 +520,43 @@ static int save_image(struct swap_map_handle *handle, int ret; int nr_pages; int err2; - struct bio *bio; - struct timeval start; - struct timeval stop; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; - printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", + hib_init_batch(&hb); + + pr_info("Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; - ret = swap_write_page(handle, data_of(*snapshot), &bio); + ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image saving progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + pr_info("Image saving done\n"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } -/** +/* * Structure used for CRC32. */ struct crc_data { @@ -489,24 +567,59 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ - unsigned char *unc[LZO_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/** - * CRC32 update function that runs in its own thread. - */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -515,16 +628,19 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * Structure used for LZO data compression. + +/* + * Structure used for data compression. */ struct cmp_data { struct task_struct *thr; /* thread */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -532,92 +648,92 @@ struct cmp_data { wait_queue_head_t done; /* compression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ - unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/** - * Compression function that runs in its own thread. - */ -static int lzo_compress_threadfn(void *data) +/* Indicates the image size after compression */ +static atomic64_t compressed_size = ATOMIC_INIT(0); + +static int compress_threadfn(void *data) { struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); - d->ret = lzo1x_1_compress(d->unc, d->unc_len, - d->cmp + LZO_HEADER, &d->cmp_len, - d->wrk); - atomic_set(&d->stop, 1); + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; + + atomic64_add(d->cmp_len, &compressed_size); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap mam handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ -static int save_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_write) +static int save_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) { unsigned int m; int ret = 0; int nr_pages; int err2; - struct bio *bio; - struct timeval start; - struct timeval stop; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + + atomic64_set(&compressed_size, 0); + /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct cmp_data, go)); - crc = kmalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } - memset(crc, 0, offsetof(struct crc_data, go)); /* * Start the compression threads. @@ -626,13 +742,26 @@ static int save_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_compress_threadfn, + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start compression threads\n"); + pr_err("Cannot start compression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -654,7 +783,7 @@ static int save_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -665,19 +794,17 @@ static int save_image_lzo(struct swap_map_handle *handle, */ handle->reqd_free_pages = reqd_free_pages(); - printk(KERN_INFO - "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages)...\n", - nr_threads, nr_to_write); + pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo); + pr_info("Compressing and saving image data (%u pages)...\n", + nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) { ret = snapshot_read_next(snapshot); if (ret < 0) goto out_finish; @@ -689,10 +816,8 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image saving progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -700,7 +825,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -708,26 +833,25 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); + pr_err("%s compression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(data[thr].unc_len))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + bytes_worst_compress(data[thr].unc_len))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -743,73 +867,74 @@ static int save_image_lzo(struct swap_map_handle *handle, * read it. */ for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); - ret = swap_write_page(handle, page, &bio); + ret = swap_write_page(handle, page, &hb); if (ret) goto out_finish; } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } out_finish: - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + err2 = hib_wait_io(&hb); + stop = ktime_get(); if (!ret) ret = err2; - if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); -out_clean: - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); + pr_info("Image saving done\n"); + } else { + pr_err("Image saving failed: %d\n", ret); } + +out_clean: + hib_finish_batch(&hb); + free_crc_data(crc); if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); + } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable from the resume partition. - */ - -static int enough_swap(unsigned int nr_pages, unsigned int flags) +static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; - pr_debug("PM: Free swap pages: %u\n", free_swap); + pr_debug("Free swap pages: %u\n", free_swap); required = PAGES_FOR_IO + nr_pages; return free_swap > required; } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -821,19 +946,19 @@ int swsusp_write(unsigned int flags) pages = snapshot_get_image_size(); error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot get swap writer\n"); + pr_err("Cannot get swap writer\n"); return error; } if (flags & SF_NOCOMPRESS_MODE) { - if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); + if (!enough_swap(pages)) { + pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); - if (error < PAGE_SIZE) { + if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; @@ -844,16 +969,16 @@ int swsusp_write(unsigned int flags) if (!error) { error = (flags & SF_NOCOMPRESS_MODE) ? save_image(&handle, &snapshot, pages - 1) : - save_image_lzo(&handle, &snapshot, pages - 1); + save_compressed_image(&handle, &snapshot, pages - 1); } out_finish: error = swap_writer_finish(&handle, flags, error); return error; } -/** - * The following functions allow us to read data using a swap map - * in a file-alike way +/* + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -886,12 +1011,11 @@ static int get_swap_reader(struct swap_map_handle *handle, last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { - tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } - memset(tmp, 0, sizeof(*tmp)); if (!handle->maps) handle->maps = tmp; if (last) @@ -899,13 +1023,13 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; } - error = hib_bio_read_page(offset, tmp->map, NULL); + error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map); if (error) { release_swap_reader(handle); return error; @@ -918,7 +1042,7 @@ static int get_swap_reader(struct swap_map_handle *handle, } static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { sector_t offset; int error; @@ -929,7 +1053,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_bio_read_page(offset, buf, bio_chain); + if (hb) + error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb); + else + error = hib_submit_io_sync(REQ_OP_READ, offset, buf); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -953,67 +1080,65 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) { unsigned int m; int ret = 0; - struct timeval start; - struct timeval stop; - struct bio *bio; + ktime_t start; + ktime_t stop; + struct hib_bio_batch hb; int err2; unsigned nr_pages; - printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", - nr_to_read); + hib_init_batch(&hb); + + clean_pages_on_read = true; + pr_info("Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) break; - ret = swap_read_page(handle, data_of(*snapshot), &bio); + ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image loading progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + pr_info("Image loading done\n"); + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } -/** - * Structure used for LZO data decompression. +/* + * Structure used for data decompression. */ struct dec_data { struct task_struct *thr; /* thread */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1021,54 +1146,54 @@ struct dec_data { wait_queue_head_t done; /* decompression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/** - * Deompression function that runs in its own thread. - */ -static int lzo_decompress_threadfn(void *data) +static int decompress_threadfn(void *data) { struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); - d->unc_len = LZO_UNC_SIZE; - d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, - d->unc, &d->unc_len); - atomic_set(&d->stop, 1); + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; + + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * load_image_lzo - Load compressed image data and decompress them with LZO. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ -static int load_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) +static int load_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) { unsigned int m; int ret = 0; int eof = 0; - struct bio *bio; - struct timeval start; - struct timeval stop; + struct hib_bio_batch hb; + ktime_t start; + ktime_t stop; unsigned nr_pages; size_t off; unsigned i, thr, run_threads, nr_threads; @@ -1079,36 +1204,37 @@ static int load_image_lzo(struct swap_map_handle *handle, struct dec_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for decompression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); - page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - for (thr = 0; thr < nr_threads; thr++) - memset(&data[thr], 0, offsetof(struct dec_data, go)); - crc = kmalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } - memset(crc, 0, offsetof(struct crc_data, go)); + + clean_pages_on_decompress = true; /* * Start the decompression threads. @@ -1117,13 +1243,26 @@ static int load_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_decompress_threadfn, + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start decompression threads\n"); + pr_err("Cannot start decompression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -1145,7 +1284,7 @@ static int load_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -1159,19 +1298,18 @@ static int load_image_lzo(struct swap_map_handle *handle, */ if (low_free_pages() > snapshot_get_image_size()) read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; - read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { - page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + page[i] = (void *)__get_free_page(i < CMP_PAGES ? + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { - if (i < LZO_CMP_PAGES) { + if (i < CMP_PAGES) { ring_size = i; - printk(KERN_ERR - "PM: Failed to allocate LZO pages\n"); + pr_err("Failed to allocate %s pages\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } else { @@ -1181,16 +1319,14 @@ static int load_image_lzo(struct swap_map_handle *handle, } want = ring_size = i; - printk(KERN_INFO - "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages)...\n", - nr_threads, nr_to_read); + pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo); + pr_info("Loading and decompressing image data (%u pages)...\n", + nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -1198,7 +1334,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for(;;) { for (i = 0; !eof && i < want; i++) { - ret = swap_read_page(handle, page[ring], &bio); + ret = swap_read_page(handle, page[ring], &hb); if (ret) { /* * On real read error, finish. On end of data, @@ -1225,7 +1361,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!asked) break; - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1235,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1244,14 +1380,13 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].cmp_len = *(size_t *)page[pg]; if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + bytes_worst_compress(UNC_SIZE))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } - need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER, PAGE_SIZE); if (need > have) { if (eof > 1) { @@ -1262,7 +1397,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(data[thr].cmp + off, page[pg], PAGE_SIZE); @@ -1272,15 +1407,15 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } /* * Wait for more data while we are decompressing. */ - if (have < LZO_CMP_PAGES && asked) { - ret = hib_wait_on_bio_chain(&bio); + if (have < CMP_PAGES && asked) { + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1291,22 +1426,20 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR - "PM: LZO decompression failed\n"); + pr_err("%s decompression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].unc_len || - data[thr].unc_len > LZO_UNC_SIZE || - data[thr].unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR - "PM: Invalid LZO uncompressed length\n"); + data[thr].unc_len > UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + pr_err("Invalid %s uncompressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -1317,16 +1450,14 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image loading progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } @@ -1334,47 +1465,46 @@ static int load_image_lzo(struct swap_map_handle *handle, } crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + pr_info("Image loading done\n"); + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { if(handle->crc32 != swsusp_header->crc32) { - printk(KERN_ERR - "PM: Invalid image CRC32!\n"); + pr_err("Invalid image CRC32!\n"); ret = -ENODATA; } } } } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: + hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); + } vfree(data); } - if (page) vfree(page); + vfree(page); return ret; } @@ -1383,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1394,7 +1525,7 @@ int swsusp_read(unsigned int *flags_p) memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot); - if (error < PAGE_SIZE) + if (error < (int)PAGE_SIZE) return error < 0 ? error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); error = get_swap_reader(&handle, flags_p); @@ -1405,89 +1536,102 @@ int swsusp_read(unsigned int *flags_p) if (!error) { error = (*flags_p & SF_NOCOMPRESS_MODE) ? load_image(&handle, &snapshot, header->pages - 1) : - load_image_lzo(&handle, &snapshot, header->pages - 1); + load_compressed_image(&handle, &snapshot, header->pages - 1); } swap_reader_finish(&handle); end: if (!error) - pr_debug("PM: Image successfully loaded\n"); + pr_debug("Image successfully loaded\n"); else - pr_debug("PM: Error %d resuming\n", error); + pr_debug("Error %d resuming\n", error); return error; } +static void *swsusp_holder; + /** - * swsusp_check - Check for swsusp signature in the resume device + * swsusp_check - Open the resume device and check for the swsusp signature. + * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - -int swsusp_check(void) +int swsusp_check(bool exclusive) { + void *holder = exclusive ? &swsusp_holder : NULL; int error; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ, NULL); - if (!IS_ERR(hib_resume_bdev)) { - set_blocksize(hib_resume_bdev, PAGE_SIZE); + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, + BLK_OPEN_READ, holder, NULL); + if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); - error = hib_bio_read_page(swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, + swsusp_header); if (error) goto put; if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ - error = hib_bio_write_page(swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, + swsusp_header); } else { error = -EINVAL; } + if (!error && swsusp_header->flags & SF_HW_SIG && + swsusp_header->hw_sig != swsusp_hardware_signature) { + pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", + swsusp_header->hw_sig, swsusp_hardware_signature); + error = -EINVAL; + } put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ); + bdev_fput(hib_resume_bdev_file); else - pr_debug("PM: Image signature found, resuming\n"); + pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev); + error = PTR_ERR(hib_resume_bdev_file); } if (error) - pr_debug("PM: Image not found (code %d)\n", error); + pr_debug("Image not found (code %d)\n", error); return error; } /** - * swsusp_close - close swap device. + * swsusp_close - close resume device. */ - -void swsusp_close(fmode_t mode) +void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); + if (IS_ERR(hib_resume_bdev_file)) { + pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, mode); + fput(hib_resume_bdev_file); } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. */ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_bio_write_page(swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, + swsusp_header); } else { - printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; } @@ -1500,8 +1644,46 @@ int swsusp_unmark(void) } #endif -static int swsusp_header_init(void) +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + +static int __init swsusp_header_init(void) +{ + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1509,3 +1691,19 @@ static int swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86f..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -1,16 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. - * */ #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -25,81 +22,84 @@ #include <linux/cpu.h> #include <linux/freezer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "power.h" - -#define SNAPSHOT_MINOR 231 +static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - char frozen; - char ready; - char platform_support; + bool frozen; + bool ready; + bool platform_support; + bool free_bitmaps; + dev_t dev; } snapshot_state; -atomic_t snapshot_device_available = ATOMIC_INIT(1); +int is_hibernate_resume_dev(dev_t dev) +{ + return hibernation_available() && snapshot_state.dev == dev; +} static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; int error; - lock_system_sleep(); + if (!hibernation_available()) + return -EPERM; - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + sleep_flags = lock_system_sleep(); + + if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { - atomic_inc(&snapshot_device_available); + hibernate_release(); error = -ENOSYS; goto Unlock; } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; - error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_HIBERNATION); + data->free_bitmaps = false; + error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to * appear. */ - wait_for_device_probe(); + need_wait = true; data->swap = -1; data->mode = O_WRONLY; - error = pm_notifier_call_chain(PM_RESTORE_PREPARE); - if (error) - pm_notifier_call_chain(PM_POST_RESTORE); - } - if (error) { - free_basic_memory_bitmaps(); - atomic_inc(&snapshot_device_available); + error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } } - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; + if (error) + hibernate_release(); + + data->frozen = false; + data->ready = false; + data->platform_support = false; + data->dev = 0; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -107,22 +107,26 @@ static int snapshot_open(struct inode *inode, struct file *filp) static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_free(); - free_basic_memory_bitmaps(); data = filp->private_data; + data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); - atomic_inc(&snapshot_device_available); + hibernate_release(); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return 0; } @@ -130,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp) static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned int sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -155,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -163,11 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned long sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + + sleep_flags = lock_system_sleep(); data = filp->private_data; @@ -176,7 +187,12 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res <= 0) goto unlock; } else { - res = PAGE_SIZE - pg_offp; + res = PAGE_SIZE; + } + + if (!data_of(data->handle)) { + res = -EINVAL; + goto unlock; } res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, @@ -184,11 +200,52 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } +struct compat_resume_swap_area { + compat_loff_t offset; + u32 dev; +} __packed; + +static int snapshot_set_swap_area(struct snapshot_data *data, + void __user *argp) +{ + sector_t offset; + dev_t swdev; + + if (swsusp_swap_in_use()) + return -EPERM; + + if (in_compat_syscall()) { + struct compat_resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } else { + struct resume_swap_area swap_area; + + if (copy_from_user(&swap_area, argp, sizeof(swap_area))) + return -EFAULT; + swdev = new_decode_dev(swap_area.dev); + offset = swap_area.offset; + } + + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + data->swap = swap_type_of(swdev, offset); + if (data->swap < 0) + return swdev ? -ENODEV : -EINVAL; + data->dev = swdev; + return 0; +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -197,6 +254,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, loff_t size; sector_t offset; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) @@ -204,9 +266,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!mutex_trylock(&pm_mutex)) + if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -215,21 +278,30 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - sys_sync(); - printk("done.\n"); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); - if (!error) - data->frozen = 1; + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else + data->frozen = true; + break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); - data->frozen = 0; + data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: @@ -247,7 +319,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_ATOMIC_RESTORE: - snapshot_write_finalize(&data->handle); + error = snapshot_write_finalize(&data->handle); + if (error) + break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; @@ -259,7 +333,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -323,7 +397,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = 0; + data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: @@ -336,34 +410,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_AREA: - if (swsusp_swap_in_use()) { - error = -EPERM; - } else { - struct resume_swap_area swap_area; - dev_t swdev; - - error = copy_from_user(&swap_area, (void __user *)arg, - sizeof(struct resume_swap_area)); - if (error) { - error = -EFAULT; - break; - } - - /* - * User space encodes device types as two-byte values, - * so we need to recode them - */ - swdev = new_decode_dev(swap_area.dev); - if (swdev) { - offset = swap_area.offset; - data->swap = swap_type_of(swdev, offset, NULL); - if (data->swap < 0) - error = -ENODEV; - } else { - data->swap = -1; - error = -EINVAL; - } - } + error = snapshot_set_swap_area(data, (void __user *)arg); break; default: @@ -371,18 +418,13 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } - mutex_unlock(&pm_mutex); + unlock_device_hotplug(); + mutex_unlock(&system_transition_mutex); return error; } #ifdef CONFIG_COMPAT - -struct compat_resume_swap_area { - compat_loff_t offset; - u32 dev; -} __packed; - static long snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -391,49 +433,15 @@ snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: - case SNAPSHOT_ALLOC_SWAP_PAGE: { - compat_loff_t __user *uoffset = compat_ptr(arg); - loff_t offset; - mm_segment_t old_fs; - int err; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, cmd, (unsigned long) &offset); - set_fs(old_fs); - if (!err && put_user(offset, uoffset)) - err = -EFAULT; - return err; - } - + case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: + case SNAPSHOT_SET_SWAP_AREA: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); - - case SNAPSHOT_SET_SWAP_AREA: { - struct compat_resume_swap_area __user *u_swap_area = - compat_ptr(arg); - struct resume_swap_area swap_area; - mm_segment_t old_fs; - int err; - - err = get_user(swap_area.offset, &u_swap_area->offset); - err |= get_user(swap_area.dev, &u_swap_area->dev); - if (err) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, - (unsigned long) &swap_area); - set_fs(old_fs); - return err; - } - default: return snapshot_ioctl(file, cmd, arg); } } - #endif /* CONFIG_COMPAT */ static const struct file_operations snapshot_fops = { @@ -441,7 +449,6 @@ static const struct file_operations snapshot_fops = { .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, - .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 8f50de394d22..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kernel/power/wakelock.c * @@ -17,13 +18,16 @@ #include <linux/list.h> #include <linux/rbtree.h> #include <linux/slab.h> +#include <linux/workqueue.h> + +#include "power.h" static DEFINE_MUTEX(wakelocks_lock); struct wakelock { char *name; struct rb_node node; - struct wakeup_source ws; + struct wakeup_source *ws; #ifdef CONFIG_PM_WAKELOCKS_GC struct list_head lru; #endif @@ -35,23 +39,23 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); - if (wl->ws.active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + if (wl->ws->active == show_active) + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + if (len > 0) + --len; + + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 @@ -81,7 +85,9 @@ static inline void decrement_wakelocks_number(void) {} #define WL_GC_COUNT_MAX 100 #define WL_GC_TIME_SEC 300 +static void __wakelocks_gc(struct work_struct *work); static LIST_HEAD(wakelocks_lru_list); +static DECLARE_WORK(wakelock_work, __wakelocks_gc); static unsigned int wakelocks_gc_count; static inline void wakelocks_lru_add(struct wakelock *wl) @@ -94,29 +100,28 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl) list_move(&wl->lru, &wakelocks_lru_list); } -static void wakelocks_gc(void) +static void __wakelocks_gc(struct work_struct *work) { struct wakelock *wl, *aux; ktime_t now; - if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) - return; + mutex_lock(&wakelocks_lock); now = ktime_get(); list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { u64 idle_time_ns; bool active; - spin_lock_irq(&wl->ws.lock); - idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); - active = wl->ws.active; - spin_unlock_irq(&wl->ws.lock); + spin_lock_irq(&wl->ws->lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time)); + active = wl->ws->active; + spin_unlock_irq(&wl->ws->lock); if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) break; if (!active) { - wakeup_source_remove(&wl->ws); + wakeup_source_unregister(wl->ws); rb_erase(&wl->node, &wakelocks_tree); list_del(&wl->lru); kfree(wl->name); @@ -125,6 +130,16 @@ static void wakelocks_gc(void) } } wakelocks_gc_count = 0; + + mutex_unlock(&wakelocks_lock); +} + +static void wakelocks_gc(void) +{ + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + schedule_work(&wakelock_work); } #else /* !CONFIG_PM_WAKELOCKS_GC */ static inline void wakelocks_lru_add(struct wakelock *wl) {} @@ -172,8 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, kfree(wl); return ERR_PTR(-ENOMEM); } - wl->ws.name = wl->name; - wakeup_source_add(&wl->ws); + + wl->ws = wakeup_source_register(NULL, wl->name); + if (!wl->ws) { + kfree(wl->name); + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws->last_time = ktime_get(); + rb_link_node(&wl->node, parent, node); rb_insert_color(&wl->node, &wakelocks_tree); wakelocks_lru_add(wl); @@ -217,9 +239,9 @@ int pm_wake_lock(const char *buf) u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; do_div(timeout_ms, NSEC_PER_MSEC); - __pm_wakeup_event(&wl->ws, timeout_ms); + __pm_wakeup_event(wl->ws, timeout_ms); } else { - __pm_stay_awake(&wl->ws); + __pm_stay_awake(wl->ws); } wakelocks_lru_most_recent(wl); @@ -255,7 +277,7 @@ int pm_wake_unlock(const char *buf) ret = PTR_ERR(wl); goto out; } - __pm_relax(&wl->ws); + __pm_relax(wl->ws); wakelocks_lru_most_recent(wl); wakelocks_gc(); |
