diff options
Diffstat (limited to 'kernel/power')
| -rw-r--r-- | kernel/power/Kconfig | 82 | ||||
| -rw-r--r-- | kernel/power/Makefile | 10 | ||||
| -rw-r--r-- | kernel/power/autosleep.c | 1 | ||||
| -rw-r--r-- | kernel/power/console.c | 15 | ||||
| -rw-r--r-- | kernel/power/em_netlink.c | 308 | ||||
| -rw-r--r-- | kernel/power/em_netlink.h | 39 | ||||
| -rw-r--r-- | kernel/power/em_netlink_autogen.c | 48 | ||||
| -rw-r--r-- | kernel/power/em_netlink_autogen.h | 23 | ||||
| -rw-r--r-- | kernel/power/energy_model.c | 786 | ||||
| -rw-r--r-- | kernel/power/hibernate.c | 470 | ||||
| -rw-r--r-- | kernel/power/main.c | 508 | ||||
| -rw-r--r-- | kernel/power/power.h | 45 | ||||
| -rw-r--r-- | kernel/power/poweroff.c | 2 | ||||
| -rw-r--r-- | kernel/power/process.c | 52 | ||||
| -rw-r--r-- | kernel/power/qos.c | 124 | ||||
| -rw-r--r-- | kernel/power/snapshot.c | 396 | ||||
| -rw-r--r-- | kernel/power/suspend.c | 71 | ||||
| -rw-r--r-- | kernel/power/suspend_test.c | 10 | ||||
| -rw-r--r-- | kernel/power/swap.c | 691 | ||||
| -rw-r--r-- | kernel/power/user.c | 46 | ||||
| -rw-r--r-- | kernel/power/wakelock.c | 14 |
21 files changed, 2844 insertions, 897 deletions
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a12779650f15..05337f437cca 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -39,9 +39,9 @@ config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE select HIBERNATE_CALLBACKS - select LZO_COMPRESS - select LZO_DECOMPRESS select CRC32 + select CRYPTO + select CRYPTO_LZO help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -92,6 +92,28 @@ config HIBERNATION_SNAPSHOT_DEV If in doubt, say Y. +choice + prompt "Default compressor" + default HIBERNATION_COMP_LZO + depends on HIBERNATION + +config HIBERNATION_COMP_LZO + bool "lzo" + depends on CRYPTO_LZO + +config HIBERNATION_COMP_LZ4 + bool "lz4" + depends on CRYPTO_LZ4 + +endchoice + +config HIBERNATION_DEF_COMP + string + default "lzo" if HIBERNATION_COMP_LZO + default "lz4" if HIBERNATION_COMP_LZ4 + help + Default compressor to be used for hibernation. + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION @@ -118,7 +140,6 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM - select SRCU config PM_SLEEP_SMP def_bool y @@ -143,6 +164,26 @@ config PM_AUTOSLEEP Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. +config PM_USERSPACE_AUTOSLEEP + bool "Userspace opportunistic sleep" + depends on PM_SLEEP + help + Notify kernel of aggressive userspace autosleep power management policy. + + This option changes the behavior of various sleep-sensitive code to deal + with frequent userspace-initiated transitions into a global sleep state. + + Saying Y here, disables code paths that most users really should keep + enabled. In particular, only enable this if it is very common to be + asleep/awake for very short periods of time (<= 2 seconds). + + Only platforms, such as Android, that implement opportunistic sleep from + a userspace power manager service should enable this option; and not + other machines. Therefore, you should say N here, unless you are + extremely certain that this is what you want. The option otherwise has + bad, undesirable effects, and should not be enabled just for fun. + + config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP @@ -161,6 +202,17 @@ config PM_WAKELOCKS_GC depends on PM_WAKELOCKS default y +config PM_QOS_CPU_SYSTEM_WAKEUP + bool "User space interface for CPU system wakeup QoS" + depends on CPU_IDLE + help + Enable this to allow user space via the cpu_wakeup_latency file to + specify a CPU system wakeup latency limit. + + This may be particularly useful for platforms supporting multiple low + power states for CPUs during system-wide suspend and s2idle in + particular. + config PM bool "Device power management core functionality" help @@ -216,11 +268,30 @@ config DPM_WATCHDOG boot session. config DPM_WATCHDOG_TIMEOUT - int "Watchdog timeout in seconds" + int "Watchdog timeout to panic in seconds" range 1 120 default 120 depends on DPM_WATCHDOG +config DPM_WATCHDOG_WARNING_TIMEOUT + int "Watchdog timeout to warn in seconds" + range 1 DPM_WATCHDOG_TIMEOUT + default DPM_WATCHDOG_TIMEOUT + depends on DPM_WATCHDOG + help + If the DPM watchdog warning timeout and main timeout are + different then a non-fatal warning (with a stack trace of + the stuck suspend routine) will be printed when the warning + timeout expires. If the suspend routine gets un-stuck + before the main timeout expires then no other action is + taken. If the routine continues to be stuck and the main + timeout expires then an emergency-level message and stack + trace will be printed and the system will panic. + + If the warning timeout is equal to the main timeout (the + default) then the warning will never happen and the system + will jump straight to panic when the main timeout expires. + config PM_TRACE bool help @@ -320,8 +391,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 5899260a8bef..773e2789412b 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +ifeq ($(CONFIG_DYNAMIC_DEBUG), y) +CFLAGS_swap.o := -DDEBUG +CFLAGS_snapshot.o := -DDEBUG +CFLAGS_energy_model.o := -DDEBUG +endif KASAN_SANITIZE_snapshot.o := n @@ -17,4 +21,6 @@ obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o -obj-$(CONFIG_ENERGY_MODEL) += energy_model.o +obj-$(CONFIG_ENERGY_MODEL) += em.o +em-y := energy_model.o +em-$(CONFIG_NET) += em_netlink_autogen.o em_netlink.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index b29c8aca7486..865df641b97c 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -9,7 +9,6 @@ #include <linux/device.h> #include <linux/mutex.h> -#include <linux/pm_wakeup.h> #include "power.h" diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..a906a0ac0f9b 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -43,9 +44,10 @@ static LIST_HEAD(pm_vt_switch_list); * no_console_suspend argument has been passed on the command line, VT * switches will occur. */ -void pm_vt_switch_required(struct device *dev, bool required) +int pm_vt_switch_required(struct device *dev, bool required) { struct pm_vt_switch *entry, *tmp; + int ret = 0; mutex_lock(&vt_switch_mutex); list_for_each_entry(tmp, &pm_vt_switch_list, head) { @@ -57,8 +59,10 @@ void pm_vt_switch_required(struct device *dev, bool required) } entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) + if (!entry) { + ret = -ENOMEM; goto out; + } entry->required = required; entry->dev = dev; @@ -66,6 +70,7 @@ void pm_vt_switch_required(struct device *dev, bool required) list_add(&entry->head, &pm_vt_switch_list); out: mutex_unlock(&vt_switch_mutex); + return ret; } EXPORT_SYMBOL(pm_vt_switch_required); @@ -136,17 +141,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c new file mode 100644 index 000000000000..4b85da138a06 --- /dev/null +++ b/kernel/power/em_netlink.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include <linux/energy_model.h> +#include <net/sock.h> +#include <net/genetlink.h> +#include <uapi/linux/energy_model.h> + +#include "em_netlink.h" +#include "em_netlink_autogen.h" + +#define EM_A_PD_CPUS_LEN 256 + +/*************************** Command encoding ********************************/ +static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + int *tot_msg_sz = data; + int msg_sz, cpus_sz; + + cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + + msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ + nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ + nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + + *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); + return 0; +} + +static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) +{ + char cpus_buf[EM_A_PD_CPUS_LEN]; + struct sk_buff *msg = data; + struct nlattr *entry; + + entry = nla_nest_start(msg, EM_A_PDS_PD); + if (!entry) + goto out_cancel_nest; + + if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + goto out_cancel_nest; + + if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + goto out_cancel_nest; + + snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", + cpumask_pr_args(to_cpumask(pd->cpus))); + if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + goto out_cancel_nest; + + nla_nest_end(msg, entry); + + return 0; + +out_cancel_nest: + nla_nest_cancel(msg, entry); + + return -EMSGSIZE; +} + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + int cmd = info->genlhdr->cmd; + int ret = -EMSGSIZE, msg_sz = 0; + + for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + if (ret) + goto out_cancel_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +out_cancel_msg: + genlmsg_cancel(msg, hdr); +out_free_msg: + nlmsg_free(msg); + + return ret; +} + +static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) +{ + struct em_perf_domain *pd; + int id; + + if (!attrs[EM_A_PD_TABLE_PD_ID]) + return NULL; + + id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + pd = em_perf_domain_get_by_id(id); + return pd; +} + +static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) +{ + int id_sz, ps_sz; + + id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ + nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ + nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + ps_sz *= pd->nr_perf_states; + + return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); +} + +static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +{ + struct em_perf_state *table, *ps; + struct nlattr *entry; + int i; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + goto out_err; + + rcu_read_lock(); + table = em_perf_state_from_pd((struct em_perf_domain *)pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + ps = &table[i]; + + entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + if (!entry) + goto out_unlock_ps; + + if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, + ps->performance, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, + ps->frequency, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_POWER, + ps->power, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_COST, + ps->cost, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, + ps->flags, EM_A_PS_PAD)) + goto out_cancel_ps_nest; + + nla_nest_end(msg, entry); + } + rcu_read_unlock(); + return 0; + +out_cancel_ps_nest: + nla_nest_cancel(msg, entry); +out_unlock_ps: + rcu_read_unlock(); +out_err: + return -EMSGSIZE; +} + +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +{ + int cmd = info->genlhdr->cmd; + int msg_sz, ret = -EMSGSIZE; + struct em_perf_domain *pd; + struct sk_buff *msg; + void *hdr; + + pd = __em_nl_get_pd_table_id(info->attrs); + if (!pd) + return -EINVAL; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +out_free_msg: + nlmsg_free(msg); + return ret; +} + + +/**************************** Event encoding *********************************/ +static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) +{ + struct sk_buff *msg; + int msg_sz, ret = -EMSGSIZE; + void *hdr; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_nl_get_pd_table_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + if (!hdr) + goto out_free_msg; + + ret = __em_nl_get_pd_table(msg, pd); + if (ret) + goto out_free_msg; + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +void em_notify_pd_created(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_CREATED); +} + +void em_notify_pd_updated(const struct em_perf_domain *pd) +{ + __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); +} + +static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) +{ + int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + + return nlmsg_total_size(genlmsg_msg_size(id_sz)); +} + +void em_notify_pd_deleted(const struct em_perf_domain *pd) +{ + struct sk_buff *msg; + void *hdr; + int msg_sz; + + if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + return; + + msg_sz = __em_notify_pd_deleted_size(pd); + + msg = genlmsg_new(msg_sz, GFP_KERNEL); + if (!msg) + return; + + hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + if (!hdr) + goto out_free_msg; + + if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + goto out_free_msg; + } + + genlmsg_end(msg, hdr); + + genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + + return; + +out_free_msg: + nlmsg_free(msg); + return; +} + +/**************************** Initialization *********************************/ +static int __init em_netlink_init(void) +{ + return genl_register_family(&em_nl_family); +} +postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink.h b/kernel/power/em_netlink.h new file mode 100644 index 000000000000..583d7f1c3939 --- /dev/null +++ b/kernel/power/em_netlink.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Generic netlink for energy model. + * + * Copyright (c) 2025 Valve Corporation. + * Author: Changwoo Min <changwoo@igalia.com> + */ +#ifndef _EM_NETLINK_H +#define _EM_NETLINK_H + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data); +struct em_perf_domain *em_perf_domain_get_by_id(int id); +void em_notify_pd_created(const struct em_perf_domain *pd); +void em_notify_pd_deleted(const struct em_perf_domain *pd); +void em_notify_pd_updated(const struct em_perf_domain *pd); +#else +static inline +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + return -EINVAL; +} +static inline +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + return NULL; +} + +static inline void em_notify_pd_created(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_deleted(const struct em_perf_domain *pd) {} + +static inline void em_notify_pd_updated(const struct em_perf_domain *pd) {} +#endif + +#endif /* _EM_NETLINK_H */ diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c new file mode 100644 index 000000000000..a7a09ab1d1c2 --- /dev/null +++ b/kernel/power/em_netlink_autogen.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "em_netlink_autogen.h" + +#include <uapi/linux/energy_model.h> + +/* EM_CMD_GET_PD_TABLE - do */ +static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { + [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +}; + +/* Ops table for em */ +static const struct genl_split_ops em_nl_ops[] = { + { + .cmd = EM_CMD_GET_PDS, + .doit = em_nl_get_pds_doit, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = EM_CMD_GET_PD_TABLE, + .doit = em_nl_get_pd_table_doit, + .policy = em_get_pd_table_nl_policy, + .maxattr = EM_A_PD_TABLE_PD_ID, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group em_nl_mcgrps[] = { + [EM_NLGRP_EVENT] = { "event", }, +}; + +struct genl_family em_nl_family __ro_after_init = { + .name = EM_FAMILY_NAME, + .version = EM_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = em_nl_ops, + .n_split_ops = ARRAY_SIZE(em_nl_ops), + .mcgrps = em_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), +}; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h new file mode 100644 index 000000000000..78ce609641f1 --- /dev/null +++ b/kernel/power/em_netlink_autogen.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_EM_GEN_H +#define _LINUX_EM_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/energy_model.h> + +int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); +int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + EM_NLGRP_EVENT, +}; + +extern struct genl_family em_nl_family; + +#endif /* _LINUX_EM_GEN_H */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 0153b0ca7b23..11af9f64aa82 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -17,12 +17,30 @@ #include <linux/sched/topology.h> #include <linux/slab.h> +#include "em_netlink.h" + /* * Mutex serializing the registrations of performance domains and letting * callbacks defined by drivers sleep. */ static DEFINE_MUTEX(em_pd_mutex); +/* + * Manage performance domains with IDs. One can iterate the performance domains + * through the list and pick one with their associated ID. The mutex serializes + * the list access. When holding em_pd_list_mutex, em_pd_mutex should not be + * taken to avoid potential deadlock. + */ +static DEFINE_IDA(em_pd_ida); +static LIST_HEAD(em_pd_list); +static DEFINE_MUTEX(em_pd_list_mutex); + +static void em_cpufreq_update_efficiencies(struct device *dev, + struct em_perf_state *table); +static void em_check_capacity_update(void); +static void em_update_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(em_update_work, em_update_workfn); + static bool _is_cpu_device(struct device *dev) { return (dev->bus == &cpu_subsys); @@ -31,19 +49,65 @@ static bool _is_cpu_device(struct device *dev) #ifdef CONFIG_DEBUG_FS static struct dentry *rootdir; -static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd) +struct em_dbg_info { + struct em_perf_domain *pd; + int ps_id; +}; + +#define DEFINE_EM_DBG_SHOW(name, fname) \ +static int em_debug_##fname##_show(struct seq_file *s, void *unused) \ +{ \ + struct em_dbg_info *em_dbg = s->private; \ + struct em_perf_state *table; \ + unsigned long val; \ + \ + rcu_read_lock(); \ + table = em_perf_state_from_pd(em_dbg->pd); \ + val = table[em_dbg->ps_id].name; \ + rcu_read_unlock(); \ + \ + seq_printf(s, "%lu\n", val); \ + return 0; \ +} \ +DEFINE_SHOW_ATTRIBUTE(em_debug_##fname) + +DEFINE_EM_DBG_SHOW(frequency, frequency); +DEFINE_EM_DBG_SHOW(power, power); +DEFINE_EM_DBG_SHOW(cost, cost); +DEFINE_EM_DBG_SHOW(performance, performance); +DEFINE_EM_DBG_SHOW(flags, inefficiency); + +static void em_debug_create_ps(struct em_perf_domain *em_pd, + struct em_dbg_info *em_dbg, int i, + struct dentry *pd) { + struct em_perf_state *table; + unsigned long freq; struct dentry *d; char name[24]; - snprintf(name, sizeof(name), "ps:%lu", ps->frequency); + em_dbg[i].pd = em_pd; + em_dbg[i].ps_id = i; + + rcu_read_lock(); + table = em_perf_state_from_pd(em_pd); + freq = table[i].frequency; + rcu_read_unlock(); + + snprintf(name, sizeof(name), "ps:%lu", freq); /* Create per-ps directory */ d = debugfs_create_dir(name, pd); - debugfs_create_ulong("frequency", 0444, d, &ps->frequency); - debugfs_create_ulong("power", 0444, d, &ps->power); - debugfs_create_ulong("cost", 0444, d, &ps->cost); - debugfs_create_ulong("inefficient", 0444, d, &ps->flags); + debugfs_create_file("frequency", 0444, d, &em_dbg[i], + &em_debug_frequency_fops); + debugfs_create_file("power", 0444, d, &em_dbg[i], + &em_debug_power_fops); + debugfs_create_file("cost", 0444, d, &em_dbg[i], + &em_debug_cost_fops); + debugfs_create_file("performance", 0444, d, &em_dbg[i], + &em_debug_performance_fops); + debugfs_create_file("inefficient", 0444, d, &em_dbg[i], + &em_debug_inefficiency_fops); } static int em_debug_cpus_show(struct seq_file *s, void *unused) @@ -54,31 +118,29 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); -static int em_debug_units_show(struct seq_file *s, void *unused) +static int em_debug_flags_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ? - "milliWatts" : "bogoWatts"; - seq_printf(s, "%s\n", units); + seq_printf(s, "%#lx\n", pd->flags); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_units); +DEFINE_SHOW_ATTRIBUTE(em_debug_flags); -static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused) +static int em_debug_id_show(struct seq_file *s, void *unused) { struct em_perf_domain *pd = s->private; - int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0; - seq_printf(s, "%d\n", enabled); + seq_printf(s, "%d\n", pd->id); return 0; } -DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies); +DEFINE_SHOW_ATTRIBUTE(em_debug_id); static void em_debug_create_pd(struct device *dev) { + struct em_dbg_info *em_dbg; struct dentry *d; int i; @@ -89,22 +151,25 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); - debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); - debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd, - &em_debug_skip_inefficiencies_fops); + debugfs_create_file("flags", 0444, d, dev->em_pd, + &em_debug_flags_fops); + + debugfs_create_file("id", 0444, d, dev->em_pd, &em_debug_id_fops); + + em_dbg = devm_kcalloc(dev, dev->em_pd->nr_perf_states, + sizeof(*em_dbg), GFP_KERNEL); + if (!em_dbg) + return; /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) - em_debug_create_ps(&dev->em_pd->table[i], d); + em_debug_create_ps(dev->em_pd, em_dbg, i, d); } static void em_debug_remove_pd(struct device *dev) { - struct dentry *debug_dir; - - debug_dir = debugfs_lookup(dev_name(dev), rootdir); - debugfs_remove_recursive(debug_dir); + debugfs_lookup_and_remove(dev_name(dev), rootdir); } static int __init em_debug_init(void) @@ -120,17 +185,187 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, - int nr_states, struct em_data_callback *cb) +static void em_release_table_kref(struct kref *kref) { - unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX; - struct em_perf_state *table; - int i, ret; - u64 fmax; + /* It was the last owner of this table so we can free */ + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); +} - table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); +/** + * em_table_free() - Handles safe free of the EM table when needed + * @table : EM table which is going to be freed + * + * No return values. + */ +void em_table_free(struct em_perf_table *table) +{ + kref_put(&table->kref, em_release_table_kref); +} + +/** + * em_table_alloc() - Allocate a new EM table + * @pd : EM performance domain for which this must be done + * + * Allocate a new EM table and initialize its kref to indicate that it + * has a user. + * Returns allocated table or NULL. + */ +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) +{ + struct em_perf_table *table; + int table_size; + + table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + + table = kzalloc(sizeof(*table) + table_size, GFP_KERNEL); if (!table) - return -ENOMEM; + return NULL; + + kref_init(&table->kref); + + return table; +} + +static void em_init_performance(struct device *dev, struct em_perf_domain *pd, + struct em_perf_state *table, int nr_states) +{ + u64 fmax, max_cap; + int i, cpu; + + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return; + + cpu = cpumask_first(em_span_cpus(pd)); + + /* + * Calculate the performance value for each frequency with + * linear relationship. The final CPU capacity might not be ready at + * boot time, but the EM will be updated a bit later with correct one. + */ + fmax = (u64) table[nr_states - 1].frequency; + max_cap = (u64) arch_scale_cpu_capacity(cpu); + for (i = 0; i < nr_states; i++) + table[i].performance = div64_u64(max_cap * table[i].frequency, + fmax); +} + +static int em_compute_costs(struct device *dev, struct em_perf_state *table, + const struct em_data_callback *cb, int nr_states, + unsigned long flags) +{ + unsigned long prev_cost = ULONG_MAX; + int i, ret; + + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + + /* Compute the cost of each performance state. */ + for (i = nr_states - 1; i >= 0; i--) { + unsigned long power_res, cost; + + if ((flags & EM_PERF_DOMAIN_ARTIFICIAL) && cb->get_cost) { + ret = cb->get_cost(dev, table[i].frequency, &cost); + if (ret || !cost || cost > EM_MAX_POWER) { + dev_err(dev, "EM: invalid cost %lu %d\n", + cost, ret); + return -EINVAL; + } + } else { + /* increase resolution of 'cost' precision */ + power_res = table[i].power * 10; + cost = power_res / table[i].performance; + } + + table[i].cost = cost; + + if (table[i].cost >= prev_cost) { + table[i].flags = EM_PERF_STATE_INEFFICIENT; + dev_dbg(dev, "EM: OPP:%lu is inefficient\n", + table[i].frequency); + } else { + prev_cost = table[i].cost; + } + } + + return 0; +} + +/** + * em_dev_compute_costs() - Calculate cost values for new runtime EM table + * @dev : Device for which the EM table is to be updated + * @table : The new EM table that is going to get the costs calculated + * @nr_states : Number of performance states + * + * Calculate the em_perf_state::cost values for new runtime EM table. The + * values are used for EAS during task placement. It also calculates and sets + * the efficiency flag for each performance state. When the function finish + * successfully the EM table is ready to be updated and used by EAS. + * + * Return 0 on success or a proper error in case of failure. + */ +int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, + int nr_states) +{ + return em_compute_costs(dev, table, NULL, nr_states, 0); +} + +/** + * em_dev_update_perf_domain() - Update runtime EM table for a device + * @dev : Device for which the EM is to be updated + * @new_table : The new EM table that is going to be used from now + * + * Update EM runtime modifiable table for the @dev using the provided @table. + * + * This function uses a mutex to serialize writers, so it must not be called + * from a non-sleeping context. + * + * Return 0 on success or an error code on failure. + */ +int em_dev_update_perf_domain(struct device *dev, + struct em_perf_table *new_table) +{ + struct em_perf_table *old_table; + struct em_perf_domain *pd; + + if (!dev) + return -EINVAL; + + /* Serialize update/unregister or concurrent updates */ + mutex_lock(&em_pd_mutex); + + if (!dev->em_pd) { + mutex_unlock(&em_pd_mutex); + return -EINVAL; + } + pd = dev->em_pd; + + kref_get(&new_table->kref); + + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); + rcu_assign_pointer(pd->em_table, new_table); + + em_cpufreq_update_efficiencies(dev, new_table->state); + + em_table_free(old_table); + + mutex_unlock(&em_pd_mutex); + + em_notify_pd_updated(pd); + return 0; +} +EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); + +static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, + struct em_perf_state *table, + const struct em_data_callback *cb, + unsigned long flags) +{ + unsigned long power, freq, prev_freq = 0; + int nr_states = pd->nr_perf_states; + int i, ret; /* Build the list of performance states for this performance domain */ for (i = 0, freq = 0; i < nr_states; i++, freq++) { @@ -139,11 +374,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, * lowest performance state of 'dev' above 'freq' and updates * 'power' and 'freq' accordingly. */ - ret = cb->active_power(&power, &freq, dev); + ret = cb->active_power(dev, &power, &freq); if (ret) { dev_err(dev, "EM: invalid perf. state: %d\n", ret); - goto free_ps_table; + return -EINVAL; } /* @@ -153,57 +388,51 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, if (freq <= prev_freq) { dev_err(dev, "EM: non-increasing freq: %lu\n", freq); - goto free_ps_table; + return -EINVAL; } /* * The power returned by active_state() is expected to be - * positive and to fit into 16 bits. + * positive and be in range. */ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", power); - goto free_ps_table; + return -EINVAL; } table[i].power = power; table[i].frequency = prev_freq = freq; } - /* Compute the cost of each performance state. */ - fmax = (u64) table[nr_states - 1].frequency; - for (i = nr_states - 1; i >= 0; i--) { - unsigned long power_res = em_scale_power(table[i].power); + em_init_performance(dev, pd, table, nr_states); - table[i].cost = div64_u64(fmax * power_res, - table[i].frequency); - if (table[i].cost >= prev_cost) { - table[i].flags = EM_PERF_STATE_INEFFICIENT; - dev_dbg(dev, "EM: OPP:%lu is inefficient\n", - table[i].frequency); - } else { - prev_cost = table[i].cost; - } - } - - pd->table = table; - pd->nr_perf_states = nr_states; + ret = em_compute_costs(dev, table, cb, nr_states, flags); + if (ret) + return -EINVAL; return 0; - -free_ps_table: - kfree(table); - return -EINVAL; } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + const struct em_data_callback *cb, + const cpumask_t *cpus, + unsigned long flags) { + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; - int cpu, ret; + int cpu, ret, num_cpus, id; if (_is_cpu_device(dev)) { + num_cpus = cpumask_weight(cpus); + + /* Prevent max possible energy calculation to not overflow */ + if (num_cpus > EM_MAX_NUM_CPUS) { + dev_err(dev, "EM: too many CPUs, overflow possible\n"); + return -EINVAL; + } + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); if (!pd) return -ENOMEM; @@ -215,11 +444,24 @@ static int em_create_pd(struct device *dev, int nr_states, return -ENOMEM; } - ret = em_create_perf_table(dev, pd, nr_states, cb); - if (ret) { - kfree(pd); - return ret; - } + pd->nr_perf_states = nr_states; + + INIT_LIST_HEAD(&pd->node); + + id = ida_alloc(&em_pd_ida, GFP_KERNEL); + if (id < 0) + return -ENOMEM; + pd->id = id; + + em_table = em_table_alloc(pd); + if (!em_table) + goto free_pd; + + ret = em_create_perf_table(dev, pd, em_table->state, cb, flags); + if (ret) + goto free_pd_table; + + rcu_assign_pointer(pd->em_table, em_table); if (_is_cpu_device(dev)) for_each_cpu(cpu, cpus) { @@ -230,26 +472,38 @@ static int em_create_pd(struct device *dev, int nr_states, dev->em_pd = pd; return 0; + +free_pd_table: + kfree(em_table); +free_pd: + kfree(pd); + ida_free(&em_pd_ida, id); + return -EINVAL; } -static void em_cpufreq_update_efficiencies(struct device *dev) +static void +em_cpufreq_update_efficiencies(struct device *dev, struct em_perf_state *table) { struct em_perf_domain *pd = dev->em_pd; - struct em_perf_state *table; struct cpufreq_policy *policy; int found = 0; - int i; + int i, cpu; - if (!_is_cpu_device(dev) || !pd) + if (!_is_cpu_device(dev)) return; - policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd))); - if (!policy) { - dev_warn(dev, "EM: Access to CPUFreq policy failed"); + /* Try to get a CPU which is active and in this PD */ + cpu = cpumask_first_and(em_span_cpus(pd), cpu_active_mask); + if (cpu >= nr_cpu_ids) { + dev_warn(dev, "EM: No online CPU for CPUFreq policy\n"); return; } - table = pd->table; + policy = cpufreq_cpu_get(cpu); + if (!policy) { + dev_warn(dev, "EM: Access to CPUFreq policy failed\n"); + return; + } for (i = 0; i < pd->nr_perf_states; i++) { if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT)) @@ -259,6 +513,8 @@ static void em_cpufreq_update_efficiencies(struct device *dev) found++; } + cpufreq_cpu_put(policy); + if (!found) return; @@ -312,13 +568,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. - * @milliwatts : Flag indicating that the power values are in milliWatts or + * @microwatts : Flag indicating that the power values are in micro-Watts or * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * - * The @milliwatts is important to set with correct value. Some kernel + * The @microwatts is important to set with correct value. Some kernel * sub-systems might rely on this flag and check if all devices in the EM are * using the same scale. * @@ -328,10 +584,36 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool milliwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; + unsigned long flags = 0; int cpu, ret; if (!dev || !nr_states || !cb) @@ -378,23 +660,51 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, } } - ret = em_create_pd(dev, nr_states, cb, cpus); + if (microwatts) + flags |= EM_PERF_DOMAIN_MICROWATTS; + else if (cb->get_cost) + flags |= EM_PERF_DOMAIN_ARTIFICIAL; + + /* + * EM only supports uW (exception is artificial EM). + * Therefore, check and force the drivers to provide + * power in uW. + */ + if (!microwatts && !(flags & EM_PERF_DOMAIN_ARTIFICIAL)) { + dev_err(dev, "EM: only supports uW power values\n"); + ret = -EINVAL; + goto unlock; + } + + ret = em_create_pd(dev, nr_states, cb, cpus, flags); if (ret) goto unlock; - if (milliwatts) - dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS; + dev->em_pd->flags |= flags; + dev->em_pd->min_perf_state = 0; + dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); unlock: mutex_unlock(&em_pd_mutex); - return ret; + if (ret) + return ret; + + mutex_lock(&em_pd_list_mutex); + list_add_tail(&dev->em_pd->node, &em_pd_list); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_created(dev->em_pd); + + return 0; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device @@ -410,6 +720,12 @@ void em_dev_unregister_perf_domain(struct device *dev) if (_is_cpu_device(dev)) return; + mutex_lock(&em_pd_list_mutex); + list_del_init(&dev->em_pd->node); + mutex_unlock(&em_pd_list_mutex); + + em_notify_pd_deleted(dev->em_pd); + /* * The mutex separates all register/unregister requests and protects * from potential clean-up/setup issues in the debugfs directories. @@ -418,9 +734,313 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - kfree(dev->em_pd->table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); + + ida_free(&em_pd_ida, dev->em_pd->id); + kfree(dev->em_pd); dev->em_pd = NULL; mutex_unlock(&em_pd_mutex); } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); + +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) +{ + struct em_perf_table *em_table; + struct em_perf_state *ps, *new_ps; + int ps_size; + + em_table = em_table_alloc(pd); + if (!em_table) + return NULL; + + new_ps = em_table->state; + + rcu_read_lock(); + ps = em_perf_state_from_pd(pd); + /* Initialize data based on old table */ + ps_size = sizeof(struct em_perf_state) * pd->nr_perf_states; + memcpy(new_ps, ps, ps_size); + + rcu_read_unlock(); + + return em_table; +} + +static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, + struct em_perf_table *em_table) +{ + int ret; + + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } + + ret = em_dev_update_perf_domain(dev, em_table); + if (ret) + goto free_em_table; + + /* + * This is one-time-update, so give up the ownership in this updater. + * The EM framework has incremented the usage counter and from now + * will keep the reference (then free the memory when needed). + */ +free_em_table: + em_table_free(em_table); + return ret; +} + +/* + * Adjustment of CPU performance values after boot, when all CPUs capacites + * are correctly calculated. + */ +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, + struct em_perf_domain *pd) +{ + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); + struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return; + } + + em_init_performance(dev, pd, em_table->state, pd->nr_perf_states); + + em_recalc_and_update(dev, pd, em_table); +} + +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. + */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + +static void em_check_capacity_update(void) +{ + cpumask_var_t cpu_done_mask; + int cpu, failed_cpus = 0; + + if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { + pr_warn("no free memory\n"); + return; + } + + /* Check if CPUs capacity has changed than update EM */ + for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy; + struct em_perf_domain *pd; + struct device *dev; + + if (cpumask_test_cpu(cpu, cpu_done_mask)) + continue; + + policy = cpufreq_cpu_get(cpu); + if (!policy) { + failed_cpus++; + continue; + } + cpufreq_cpu_put(policy); + + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); + if (!pd || em_is_artificial(pd)) + continue; + + cpumask_or(cpu_done_mask, cpu_done_mask, + em_span_cpus(pd)); + + em_adjust_new_capacity(cpu, dev, pd); + } + + if (failed_cpus) + schedule_delayed_work(&em_update_work, msecs_to_jiffies(1000)); + + free_cpumask_var(cpu_done_mask); +} + +static void em_update_workfn(struct work_struct *work) +{ + em_check_capacity_update(); +} + +/** + * em_dev_update_chip_binning() - Update Energy Model after the new voltage + * information is present in the OPPs. + * @dev : Device for which the Energy Model has to be updated. + * + * This function allows to update easily the EM with new values available in + * the OPP framework and DT. It can be used after the chip has been properly + * verified by device drivers and the voltages adjusted for the 'chip binning'. + */ +int em_dev_update_chip_binning(struct device *dev) +{ + struct em_perf_table *em_table; + struct em_perf_domain *pd; + int i, ret; + + if (IS_ERR_OR_NULL(dev)) + return -EINVAL; + + pd = em_pd_get(dev); + if (!pd) { + dev_warn(dev, "Couldn't find Energy Model\n"); + return -EINVAL; + } + + em_table = em_table_dup(pd); + if (!em_table) { + dev_warn(dev, "EM: allocation failed\n"); + return -ENOMEM; + } + + /* Update power values which might change due to new voltage in OPPs */ + for (i = 0; i < pd->nr_perf_states; i++) { + unsigned long freq = em_table->state[i].frequency; + unsigned long power; + + ret = dev_pm_opp_calc_power(dev, &power, &freq); + if (ret) { + em_table_free(em_table); + return ret; + } + + em_table->state[i].power = power; + } + + return em_recalc_and_update(dev, pd, em_table); +} +EXPORT_SYMBOL_GPL(em_dev_update_chip_binning); + + +/** + * em_update_performance_limits() - Update Energy Model with performance + * limits information. + * @pd : Performance Domain with EM that has to be updated. + * @freq_min_khz : New minimum allowed frequency for this device. + * @freq_max_khz : New maximum allowed frequency for this device. + * + * This function allows to update the EM with information about available + * performance levels. It takes the minimum and maximum frequency in kHz + * and does internal translation to performance levels. + * Returns 0 on success or -EINVAL when failed. + */ +int em_update_performance_limits(struct em_perf_domain *pd, + unsigned long freq_min_khz, unsigned long freq_max_khz) +{ + struct em_perf_state *table; + int min_ps = -1; + int max_ps = -1; + int i; + + if (!pd) + return -EINVAL; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + + for (i = 0; i < pd->nr_perf_states; i++) { + if (freq_min_khz == table[i].frequency) + min_ps = i; + if (freq_max_khz == table[i].frequency) + max_ps = i; + } + rcu_read_unlock(); + + /* Only update when both are found and sane */ + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps) + return -EINVAL; + + + /* Guard simultaneous updates and make them atomic */ + mutex_lock(&em_pd_mutex); + pd->min_perf_state = min_ps; + pd->max_perf_state = max_ps; + mutex_unlock(&em_pd_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(em_update_performance_limits); + +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +void em_rebuild_sched_domains(void) +{ + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_NET) +int for_each_em_perf_domain(int (*cb)(struct em_perf_domain*, void *), + void *data) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + int ret; + + ret = cb(pd, data); + if (ret) + return ret; + } + + return 0; +} + +struct em_perf_domain *em_perf_domain_get_by_id(int id) +{ + struct em_perf_domain *pd; + + lockdep_assert_not_held(&em_pd_mutex); + guard(mutex)(&em_pd_list_mutex); + + list_for_each_entry(pd, &em_pd_list, node) { + if (pd->id == id) + return pd; + } + + return NULL; +} +#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..af8d07bafe02 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,8 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <crypto/acompress.h> +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/reboot.h> @@ -28,7 +30,6 @@ #include <linux/gfp.h> #include <linux/syscore_ops.h> #include <linux/ctype.h> -#include <linux/genhd.h> #include <linux/ktime.h> #include <linux/security.h> #include <linux/secretmem.h> @@ -47,6 +48,15 @@ dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; +static char hibernate_compressor[CRYPTO_MAX_ALG_NAME] = CONFIG_HIBERNATION_DEF_COMP; + +/* + * Compression/decompression algorithm to be used while saving/loading + * image to/from disk. This would later be used in 'kernel/power/swap.c' + * to allocate comp streams. + */ +char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; + enum { HIBERNATION_INVALID, HIBERNATION_PLATFORM, @@ -70,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); @@ -80,11 +101,16 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION) && - !secretmem_active(); + !secretmem_active() && !cxl_mem_active(); } /** @@ -93,20 +119,24 @@ bool hibernation_available(void) */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { + unsigned int sleep_flags; + if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } - lock_system_sleep(); + + sleep_flags = lock_system_sleep(); + hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); @@ -119,10 +149,15 @@ bool system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - pr_info("debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) @@ -357,6 +392,23 @@ static int create_image(int platform_mode) return error; } +static void shrink_shmem_memory(void) +{ + struct sysinfo info; + unsigned long nr_shmem_pages, nr_freed_pages; + + si_meminfo(&info); + nr_shmem_pages = info.sharedram; /* current page count used for shmem */ + /* + * The intent is to reclaim all shmem pages. Though shrink_all_memory() can + * only reclaim about half of them, it's enough for creating the hibernation + * image. + */ + nr_freed_pages = shrink_all_memory(nr_shmem_pages); + pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", + nr_shmem_pages, nr_freed_pages); +} + /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. @@ -398,7 +450,16 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); + /* + * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem + * pages will use lots of system memory, causing hibernation image creation + * fail due to insufficient free memory. + * This call is to force flush the shmem pages to swap disk and reclaim + * the system memory so that image creation can succeed. + */ + shrink_shmem_memory(); + + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -424,7 +485,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); dpm_complete(msg); Close: @@ -534,8 +595,7 @@ int hibernation_restore(int platform_mode) int error; pm_prepare_console(); - suspend_console(); - pm_restrict_gfp_mask(); + console_suspend_all(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); @@ -547,8 +607,7 @@ int hibernation_restore(int platform_mode) BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); - pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); pm_restore_console(); return error; } @@ -573,7 +632,7 @@ int hibernation_platform_enter(void) goto Close; entering_platform_hibernation = true; - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -595,7 +654,11 @@ int hibernation_platform_enter(void) local_irq_disable(); system_state = SYSTEM_SUSPEND; - syscore_suspend(); + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -607,6 +670,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); @@ -621,7 +685,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -638,23 +702,16 @@ int hibernation_platform_enter(void) */ static void power_down(void) { -#ifdef CONFIG_SUSPEND int error; +#ifdef CONFIG_SUSPEND if (hibernation_mode == HIBERNATION_SUSPEND) { - error = suspend_devices_and_enter(PM_SUSPEND_MEM); - if (error) { - hibernation_mode = hibernation_ops ? - HIBERNATION_PLATFORM : - HIBERNATION_SHUTDOWN; - } else { - /* Restore swap signature. */ - error = swsusp_unmark(); - if (error) - pr_err("Swap will be unusable! Try swapon -a.\n"); + error = suspend_devices_and_enter(mem_sleep_current); + if (!error) + goto exit; - return; - } + hibernation_mode = hibernation_ops ? HIBERNATION_PLATFORM : + HIBERNATION_SHUTDOWN; } #endif @@ -663,11 +720,19 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - hibernation_platform_enter(); + error = hibernation_platform_enter(); + if (error == -EAGAIN || error == -EBUSY) { + events_check_enabled = false; + pr_info("Wakeup event detected during hibernation, rolling back.\n"); + goto exit; + } fallthrough; case HIBERNATION_SHUTDOWN: - if (pm_power_off) + if (kernel_can_power_off()) { + entering_platform_hibernation = true; kernel_power_off(); + entering_platform_hibernation = false; + } break; } kernel_halt(); @@ -678,6 +743,12 @@ static void power_down(void) pr_crit("Power down manually\n"); while (1) cpu_relax(); + +exit: + /* Restore swap signature. */ + error = swsusp_unmark(); + if (error) + pr_err("Swap will be unusable! Try swapon -a.\n"); } static int load_image_and_restore(void) @@ -689,11 +760,13 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(); goto Unlock; + } error = swsusp_read(&flags); - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -706,12 +779,16 @@ static int load_image_and_restore(void) return error; } +#define COMPRESSION_ALGO_LZO "lzo" +#define COMPRESSION_ALGO_LZ4 "lz4" + /** * hibernate - Carry out system hibernation, including saving the image. */ int hibernate(void) { bool snapshot_test = false; + unsigned int sleep_flags; int error; if (!hibernation_available()) { @@ -719,7 +796,18 @@ int hibernate(void) return -EPERM; } - lock_system_sleep(); + /* + * Query for the compression algorithm support if compression is enabled. + */ + if (!nocompress) { + strscpy(hib_comp_algo, hibernate_compressor); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { + pr_err("%s compression is not available\n", hib_comp_algo); + return -EOPNOTSUPP; + } + } + + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; @@ -732,7 +820,11 @@ int hibernate(void) if (error) goto Restore; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + goto Notify; + + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -753,11 +845,24 @@ int hibernate(void) if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; - if (nocompress) + if (nocompress) { flags |= SF_NOCOMPRESS_MODE; - else + } else { flags |= SF_CRC32_MODE; + /* + * By default, LZO compression is enabled. Use SF_COMPRESSION_ALG_LZ4 + * to override this behaviour and use LZ4. + * + * Refer kernel/power/power.h for more details + */ + + if (!strcmp(hib_comp_algo, COMPRESSION_ALGO_LZ4)) + flags |= SF_COMPRESSION_ALG_LZ4; + else + flags |= SF_COMPRESSION_ALG_LZO; + } + pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -779,7 +884,7 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(); + error = swsusp_check(false); if (!error) error = load_image_and_restore(); } @@ -788,12 +893,14 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: + filesystems_thaw(); + Notify: pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); hibernate_release(); Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; @@ -808,9 +915,10 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { + unsigned int sleep_flags; int error; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -823,6 +931,8 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; + filesystems_freeze(filesystem_freeze_enabled); + error = freeze_processes(); if (error) goto exit; @@ -843,7 +953,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto dpm_complete; - suspend_console(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -867,7 +977,7 @@ skip: dpm_resume: dpm_resume(PMSG_THAW); - resume_console(); + console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); @@ -882,6 +992,7 @@ thaw: thaw_processes(); exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: @@ -890,56 +1001,16 @@ restore: hibernate_release(); unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); -/** - * software_resume - Resume from a saved hibernation image. - * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. - */ -static int software_resume(void) +static int __init find_resume_device(void) { - int error; - - /* - * If the user said "noresume".. bail out early. - */ - if (noresume || !hibernation_available()) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take system_transition_mutex) - * this can cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } + if (!strlen(resume_file)) + return -ENOENT; pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); @@ -950,40 +1021,57 @@ static int software_resume(void) } /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) + return 0; - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; - goto Unlock; - } + /* + * Some device discovery might still be in progress; we need to wait for + * this to finish. + */ + wait_for_device_probe(); + if (resume_wait) { + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) + msleep(10); + async_synchronize_full(); } - Check_image: + return early_lookup_bdev(resume_file, &swsusp_resume_device); +} + +static int software_resume(void) +{ + int error; + pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); - error = swsusp_check(); + + mutex_lock(&system_transition_mutex); + error = swsusp_check(true); if (error) goto Unlock; + /* + * Check if the hibernation image is compressed. If so, query for + * the algorithm support. + */ + if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { + if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); + else + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { + pr_err("%s compression is not available\n", hib_comp_algo); + error = -EOPNOTSUPP; + goto Unlock; + } + } + /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(); goto Unlock; } @@ -993,19 +1081,25 @@ static int software_resume(void) if (error) goto Restore; + filesystems_freeze(filesystem_freeze_enabled); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); - if (error) + if (error) { + filesystems_thaw(); goto Close_Finish; + } error = freeze_kernel_threads(); if (error) { thaw_processes(); + filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); + filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: @@ -1018,11 +1112,43 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(); goto Finish; } -late_initcall_sync(software_resume); +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { @@ -1064,11 +1190,11 @@ static const char * const hibernation_modes[] = { static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + ssize_t count = 0; int i; - char *start = buf; if (!hibernation_available()) - return sprintf(buf, "[disabled]\n"); + return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) @@ -1088,22 +1214,27 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, continue; } if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } - buf += sprintf(buf, "\n"); - return buf-start; + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; int error = 0; - int i; int len; char *p; - int mode = HIBERNATION_INVALID; + int i; if (!hibernation_available()) return -EPERM; @@ -1111,7 +1242,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -1141,7 +1272,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } @@ -1150,16 +1281,21 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); + return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - dev_t res; + unsigned int sleep_flags; int len = n; char *name; + dev_t dev; + int error; + + if (!hibernation_available()) + return n; if (len && buf[len-1] == '\n') len--; @@ -1167,14 +1303,32 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!name) return -ENOMEM; - res = name_to_dev_t(name); + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + error = 0; + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } kfree(name); - if (!res) - return -EINVAL; + if (error) + return error; + + sleep_flags = lock_system_sleep(); + swsusp_resume_device = dev; + unlock_system_sleep(sleep_flags); - lock_system_sleep(); - swsusp_resume_device = res; - unlock_system_sleep(); pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; @@ -1187,7 +1341,7 @@ power_attr(resume); static ssize_t resume_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); + return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); } static ssize_t resume_offset_store(struct kobject *kobj, @@ -1210,7 +1364,7 @@ power_attr(resume_offset); static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", image_size); + return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1231,7 +1385,7 @@ power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", reserved_size); + return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, @@ -1278,7 +1432,7 @@ static int __init resume_setup(char *str) if (noresume) return 1; - strncpy(resume_file, str, 255); + strscpy(resume_file, str); return 1; } @@ -1328,7 +1482,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } @@ -1339,6 +1493,56 @@ static int __init nohibernate_setup(char *str) return 1; } +static const char * const comp_alg_enabled[] = { +#if IS_ENABLED(CONFIG_CRYPTO_LZO) + COMPRESSION_ALGO_LZO, +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) + COMPRESSION_ALGO_LZ4, +#endif +}; + +static int hibernate_compressor_param_set(const char *compressor, + const struct kernel_param *kp) +{ + int index, ret; + + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; + + index = sysfs_match_string(comp_alg_enabled, compressor); + if (index >= 0) { + ret = param_set_copystring(comp_alg_enabled[index], kp); + if (!ret) + strscpy(hib_comp_algo, comp_alg_enabled[index]); + } else { + ret = index; + } + + mutex_unlock(&system_transition_mutex); + + if (ret) + pr_debug("Cannot set specified compressor %s\n", + compressor); + + return ret; +} + +static const struct kernel_param_ops hibernate_compressor_param_ops = { + .set = hibernate_compressor_param_set, + .get = param_get_string, +}; + +static struct kparam_string hibernate_compressor_param_string = { + .maxlen = sizeof(hibernate_compressor), + .string = hibernate_compressor, +}; + +module_param_cb(compressor, &hibernate_compressor_param_ops, + &hibernate_compressor_param_string, 0644); +MODULE_PARM_DESC(compressor, + "Compression algorithm to be used with hibernation"); + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 44169f3081fd..03b2c5495c77 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -6,7 +6,9 @@ * Copyright (c) 2003 Open Source Development Lab */ +#include <linux/acpi.h> #include <linux/export.h> +#include <linux/init.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> @@ -16,37 +18,65 @@ #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> +#include <linux/atomic.h> +#include <linux/wait.h> #include "power.h" #ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). + */ +static unsigned int saved_gfp_count; +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + + if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + return; + + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + + pm_pr_dbg("GFP mask restored\n"); +} -void lock_system_sleep(void) +void pm_restrict_gfp_mask(void) { - current->flags |= PF_FREEZER_SKIP; + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + + if (saved_gfp_count++) { + WARN_ON((saved_gfp_mask & ~(__GFP_IO | __GFP_FS)) != gfp_allowed_mask); + return; + } + + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); + + pm_pr_dbg("GFP mask restricted\n"); +} + +unsigned int lock_system_sleep(void) +{ + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); + return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); -void unlock_system_sleep(void) +void unlock_system_sleep(unsigned int flags) { - /* - * Don't use freezer_count() because we don't want the call to - * try_to_freeze() here. - * - * Reason: - * Fundamentally, we just don't need it, because freezing condition - * doesn't come into effect until we release the - * system_transition_mutex lock, since the freezer always works with - * system_transition_mutex held. - * - * More importantly, in the case of hibernation, - * unlock_system_sleep() gets called in snapshot_read() and - * snapshot_write() when the freezing condition is still in effect. - * Which means, if we use try_to_freeze() here, it would make them - * enter the refrigerator, thus causing hibernation to lockup. - */ - current->flags &= ~PF_FREEZER_SKIP; + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); @@ -64,6 +94,61 @@ void ksys_sync_helper(void) } EXPORT_SYMBOL_GPL(ksys_sync_helper); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +/* Wakeup events handling resolution while syncing file systems in jiffies */ +#define PM_FS_SYNC_WAKEUP_RESOLUTION 5 + +static atomic_t pm_fs_sync_count = ATOMIC_INIT(0); +static struct workqueue_struct *pm_fs_sync_wq; +static DECLARE_WAIT_QUEUE_HEAD(pm_fs_sync_wait); + +static bool pm_fs_sync_completed(void) +{ + return atomic_read(&pm_fs_sync_count) == 0; +} + +static void pm_fs_sync_work_fn(struct work_struct *work) +{ + ksys_sync_helper(); + + if (atomic_dec_and_test(&pm_fs_sync_count)) + wake_up(&pm_fs_sync_wait); +} +static DECLARE_WORK(pm_fs_sync_work, pm_fs_sync_work_fn); + +/** + * pm_sleep_fs_sync() - Sync file systems in an interruptible way + * + * Return: 0 on successful file system sync, or -EBUSY if the file system sync + * was aborted. + */ +int pm_sleep_fs_sync(void) +{ + pm_wakeup_clear(0); + + /* + * Take back-to-back sleeps into account by queuing a subsequent fs sync + * only if the previous fs sync is running or is not queued. Multiple fs + * syncs increase the likelihood of saving the latest files immediately + * before sleep. + */ + if (!work_pending(&pm_fs_sync_work)) { + atomic_inc(&pm_fs_sync_count); + queue_work(pm_fs_sync_wq, &pm_fs_sync_work); + } + + while (!pm_fs_sync_completed()) { + if (pm_wakeup_pending()) + return -EBUSY; + + wait_event_timeout(pm_fs_sync_wait, pm_fs_sync_completed(), + PM_FS_SYNC_WAKEUP_RESOLUTION); + } + + return 0; +} +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); @@ -97,10 +182,18 @@ int pm_notifier_call_chain(unsigned long val) /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_async_enabled); + return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -124,24 +217,27 @@ power_attr(pm_async); static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; suspend_state_t i; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { + if (i >= PM_SUSPEND_MEM && cxl_mem_active()) + continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) - s += sprintf(s, "[%s] ", label); + count += sysfs_emit_at(buf, count, "[%s] ", label); else - s += sprintf(s, "%s ", label); + count += sysfs_emit_at(buf, count, "%s ", label); } + } /* Convert the last space to a newline if needed. */ - if (s != buf) - *(s-1) = '\n'; + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) @@ -192,17 +288,17 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr power_attr(mem_sleep); /* - * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * sync_on_suspend: Sync file systems before suspend. * - * show() returns whether ksys_sync_helper() is invoked before suspend. - * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + * show() returns whether file systems sync before suspend is enabled. + * store() accepts 0 or 1. 0 disables file systems sync and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", sync_on_suspend_enabled); + return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, @@ -239,37 +335,38 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = { static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) - s += sprintf(s, "[%s] ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else - s += sprintf(s, "%s ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + unsigned int sleep_flags; const char * const *s; + int error = -EINVAL; int level; char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -279,7 +376,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } @@ -287,44 +384,117 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ -static char *suspend_step_name(enum suspend_stat_step step) -{ - switch (step) { - case SUSPEND_FREEZE: - return "freeze"; - case SUSPEND_PREPARE: - return "prepare"; - case SUSPEND_SUSPEND: - return "suspend"; - case SUSPEND_SUSPEND_NOIRQ: - return "suspend_noirq"; - case SUSPEND_RESUME_NOIRQ: - return "resume_noirq"; - case SUSPEND_RESUME: - return "resume"; - default: - return ""; +#define SUSPEND_NR_STEPS SUSPEND_RESUME +#define REC_FAILED_NUM 2 + +struct suspend_stats { + unsigned int step_failures[SUSPEND_NR_STEPS]; + unsigned int success; + unsigned int fail; + int last_failed_dev; + char failed_devs[REC_FAILED_NUM][40]; + int last_failed_errno; + int errno[REC_FAILED_NUM]; + int last_failed_step; + u64 last_hw_sleep; + u64 total_hw_sleep; + u64 max_hw_sleep; + enum suspend_stat_step failed_steps[REC_FAILED_NUM]; +}; + +static struct suspend_stats suspend_stats; +static DEFINE_MUTEX(suspend_stats_lock); + +void dpm_save_failed_dev(const char *name) +{ + mutex_lock(&suspend_stats_lock); + + strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], + name, sizeof(suspend_stats.failed_devs[0])); + suspend_stats.last_failed_dev++; + suspend_stats.last_failed_dev %= REC_FAILED_NUM; + + mutex_unlock(&suspend_stats_lock); +} + +void dpm_save_failed_step(enum suspend_stat_step step) +{ + suspend_stats.step_failures[step-1]++; + suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; + suspend_stats.last_failed_step++; + suspend_stats.last_failed_step %= REC_FAILED_NUM; +} + +void dpm_save_errno(int err) +{ + if (!err) { + suspend_stats.success++; + return; } + + suspend_stats.fail++; + + suspend_stats.errno[suspend_stats.last_failed_errno] = err; + suspend_stats.last_failed_errno++; + suspend_stats.last_failed_errno %= REC_FAILED_NUM; +} + +void pm_report_hw_sleep_time(u64 t) +{ + suspend_stats.last_hw_sleep = t; + suspend_stats.total_hw_sleep += t; } +EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); + +void pm_report_max_hw_sleep(u64 t) +{ + suspend_stats.max_hw_sleep = t; +} +EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); + +static const char * const suspend_step_names[] = { + [SUSPEND_WORKING] = "", + [SUSPEND_FREEZE] = "freeze", + [SUSPEND_PREPARE] = "prepare", + [SUSPEND_SUSPEND] = "suspend", + [SUSPEND_SUSPEND_LATE] = "suspend_late", + [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", + [SUSPEND_RESUME_NOIRQ] = "resume_noirq", + [SUSPEND_RESUME_EARLY] = "resume_early", + [SUSPEND_RESUME] = "resume", +}; + +#define suspend_attr(_name, format_str) \ +static ssize_t _name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ +{ \ + return sysfs_emit(buf, format_str, suspend_stats._name);\ +} \ +static struct kobj_attribute _name = __ATTR_RO(_name) + +suspend_attr(success, "%u\n"); +suspend_attr(fail, "%u\n"); +suspend_attr(last_hw_sleep, "%llu\n"); +suspend_attr(total_hw_sleep, "%llu\n"); +suspend_attr(max_hw_sleep, "%llu\n"); -#define suspend_attr(_name) \ +#define suspend_step_attr(_name, step) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ - return sprintf(buf, "%d\n", suspend_stats._name); \ + return sysfs_emit(buf, "%u\n", \ + suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) -suspend_attr(success); -suspend_attr(fail); -suspend_attr(failed_freeze); -suspend_attr(failed_prepare); -suspend_attr(failed_suspend); -suspend_attr(failed_suspend_late); -suspend_attr(failed_suspend_noirq); -suspend_attr(failed_resume); -suspend_attr(failed_resume_early); -suspend_attr(failed_resume_noirq); +suspend_step_attr(failed_freeze, SUSPEND_FREEZE); +suspend_step_attr(failed_prepare, SUSPEND_PREPARE); +suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); +suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); +suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); +suspend_step_attr(failed_resume, SUSPEND_RESUME); +suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); +suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -336,7 +506,7 @@ static ssize_t last_failed_dev_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; - return sprintf(buf, "%s\n", last_failed_dev); + return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); @@ -350,23 +520,21 @@ static ssize_t last_failed_errno_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; - return sprintf(buf, "%d\n", last_failed_errno); + return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int index; enum suspend_stat_step step; - char *last_failed_step = NULL; + int index; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; - last_failed_step = suspend_step_name(step); - return sprintf(buf, "%s\n", last_failed_step); + return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); @@ -384,18 +552,37 @@ static struct attribute *suspend_attrs[] = { &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, + &last_hw_sleep.attr, + &total_hw_sleep.attr, + &max_hw_sleep.attr, NULL, }; +static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + if (attr != &last_hw_sleep.attr && + attr != &total_hw_sleep.attr && + attr != &max_hw_sleep.attr) + return 0444; + +#ifdef CONFIG_ACPI + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) + return 0444; +#endif + return 0; +} + static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, + .is_visible = suspend_attr_is_visible, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; + enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; @@ -403,47 +590,35 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", - "success", suspend_stats.success, - "fail", suspend_stats.fail, - "failed_freeze", suspend_stats.failed_freeze, - "failed_prepare", suspend_stats.failed_prepare, - "failed_suspend", suspend_stats.failed_suspend, - "failed_suspend_late", - suspend_stats.failed_suspend_late, - "failed_suspend_noirq", - suspend_stats.failed_suspend_noirq, - "failed_resume", suspend_stats.failed_resume, - "failed_resume_early", - suspend_stats.failed_resume_early, - "failed_resume_noirq", - suspend_stats.failed_resume_noirq); + + seq_printf(s, "success: %u\nfail: %u\n", + suspend_stats.success, suspend_stats.fail); + + for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) + seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], + suspend_stats.step_failures[step-1]); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", - suspend_stats.failed_devs[last_dev]); + suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-s\n", - suspend_stats.failed_devs[index]); + seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; - seq_printf(s, "\t\t\t%-d\n", - suspend_stats.errno[index]); + seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[last_step])); + suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", - suspend_step_name( - suspend_stats.failed_steps[index])); + suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; @@ -460,6 +635,10 @@ static int __init pm_debugfs_init(void) late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -474,7 +653,7 @@ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_print_times_enabled); + return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, @@ -497,24 +676,33 @@ power_attr(pm_print_times); static inline void pm_print_times_init(void) { - pm_print_times_enabled = !!initcall_debug; + pm_print_times_enabled = initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; + if (!pm_wakeup_irq()) + return -ENODATA; + + return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; +bool pm_debug_messages_should_print(void) +{ + return pm_debug_messages_on && pm_sleep_transition_in_progress(); +} +EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); + static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_debug_messages_on); + return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, @@ -542,35 +730,6 @@ static int __init pm_debug_messages_setup(char *str) } __setup("pm_debug_messages", pm_debug_messages_setup); -/** - * __pm_pr_dbg - Print a suspend debug message to the kernel log. - * @defer: Whether or not to use printk_deferred() to print the message. - * @fmt: Message format. - * - * The message will be emitted if enabled through the pm_debug_messages - * sysfs attribute. - */ -void __pm_pr_dbg(bool defer, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (!pm_debug_messages_on) - return; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - if (defer) - printk_deferred(KERN_DEBUG "PM: %pV", &vaf); - else - printk(KERN_DEBUG "PM: %pV", &vaf); - - va_end(args); -} - #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -591,21 +750,23 @@ struct kobject *power_kobj; static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) - s += sprintf(s,"%s ", pm_states[i]); + count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) - s += sprintf(s, "disk "); - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; - return (s - buf); + count += sysfs_emit_at(buf, count, "disk "); + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static suspend_state_t decode_state(const char *buf, size_t n) @@ -705,7 +866,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj, unsigned int val; return pm_get_wakeup_count(&val, true) ? - sprintf(buf, "%u\n", val) : -EINTR; + sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -747,17 +908,17 @@ static ssize_t autosleep_show(struct kobject *kobj, suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) - return sprintf(buf, "off\n"); + return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state] ? + return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION - return sprintf(buf, "disk\n"); + return sysfs_emit(buf, "disk\n"); #else - return sprintf(buf, "error"); + return sysfs_emit(buf, "error\n"); #endif } @@ -826,7 +987,7 @@ int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_trace_enabled); + return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t @@ -863,7 +1024,7 @@ power_attr_ro(pm_trace_dev_match); static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", freeze_timeout_msecs); + return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, @@ -883,6 +1044,34 @@ power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +bool filesystem_freeze_enabled = false; + +static ssize_t freeze_filesystems_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); +} + +static ssize_t freeze_filesystems_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + filesystem_freeze_enabled = !!val; + return n; +} + +power_attr(freeze_filesystems); +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -913,6 +1102,9 @@ static struct attribute * g[] = { #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + &freeze_filesystems_attr.attr, +#endif NULL, }; @@ -931,16 +1123,26 @@ static const struct attribute_group *attr_groups[] = { struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); -static int __init pm_start_workqueue(void) +static int __init pm_start_workqueues(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE | WQ_UNBOUND, 0); + if (!pm_wq) + return -ENOMEM; + +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + pm_fs_sync_wq = alloc_ordered_workqueue("pm_fs_sync", 0); + if (!pm_fs_sync_wq) { + destroy_workqueue(pm_wq); + return -ENOMEM; + } +#endif - return pm_wq ? 0 : -ENOMEM; + return 0; } static int __init pm_init(void) { - int error = pm_start_workqueue(); + int error = pm_start_workqueues(); if (error) return error; hibernate_image_size_init(); diff --git a/kernel/power/power.h b/kernel/power/power.h index b4f433943209..75b63843886e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -6,6 +6,7 @@ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpuidle.h> +#include <linux/crypto.h> struct swsusp_info { struct new_utsname uts; @@ -17,6 +18,11 @@ struct swsusp_info { unsigned long size; } __aligned(PAGE_SIZE); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern int pm_sleep_fs_sync(void); +extern bool filesystem_freeze_enabled; +#endif + #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __init hibernate_reserved_size_init(void); @@ -26,9 +32,6 @@ extern void __init hibernate_image_size_init(void); /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) -extern int arch_hibernation_header_save(void *addr, unsigned int max_size); -extern int arch_hibernation_header_restore(void *addr); - static inline int init_header_complete(struct swsusp_info *info) { return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); @@ -41,8 +44,6 @@ static inline const char *check_image_kernel(struct swsusp_info *info) } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ -extern int hibernate_resume_nonboot_cpu_disable(void); - /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -59,6 +60,10 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; +extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; + +/* kernel/power/swap.c */ +extern unsigned int swsusp_header_flags; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); @@ -71,10 +76,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ @@ -110,7 +119,7 @@ extern int hibernate_preallocate_memory(void); extern void clear_or_poison_free_pages(void); -/** +/* * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries * (PBEs) which is the main data structure of swsusp. @@ -153,7 +162,7 @@ extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); extern int snapshot_read_next(struct snapshot_handle *handle); extern int snapshot_write_next(struct snapshot_handle *handle); -extern void snapshot_write_finalize(struct snapshot_handle *handle); +int snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); extern bool hibernate_acquire(void); @@ -167,19 +176,35 @@ extern int swsusp_swap_in_use(void); * Flags that can be passed from the hibernatig hernel to the "boot" kernel in * the image header. */ +#define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 #define SF_HW_SIG 8 +/* + * Bit to indicate the compression algorithm to be used(for LZ4). The same + * could be checked while saving/loading image to/from disk to use the + * corresponding algorithms. + * + * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use + * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4. + * + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO + * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4 + */ +#define SF_COMPRESSION_ALG_LZ4 16 + /* kernel/power/hibernate.c */ -extern int swsusp_check(void); +int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(fmode_t); +void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); +#else +static inline int swsusp_unmark(void) { return 0; } #endif struct __kernel_old_timeval; @@ -325,3 +350,5 @@ static inline void pm_sleep_enable_secondary_cpus(void) suspend_enable_secondary_cpus(); cpuidle_resume(); } + +void dpm_save_errno(int err); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 562aa0e450ed..1f306f158696 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key) +static void handle_poweroff(u8 key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); diff --git a/kernel/power/process.c b/kernel/power/process.c index b7e7798637b8..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -6,9 +6,6 @@ * Originally from swsusp. */ - -#undef DEBUG - #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> @@ -30,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { + const char *what = user_only ? "user space processes" : + "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; @@ -39,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only) bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; + pr_info("Freezing %s\n", what); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -53,8 +54,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; + todo++; } read_unlock(&tasklist_lock); @@ -86,28 +86,26 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = ktime_to_ms(elapsed); if (todo) { - pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", + pr_err("Freezing %s %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) - show_all_workqueues(); + show_freezable_workqueues(); if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); } } else { - pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, - elapsed_msecs % 1000); + pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", + what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; @@ -132,17 +130,14 @@ int freeze_processes(void) current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); - pm_wakeup_clear(true); - pr_info("Freezing user space processes ... "); + pm_wakeup_clear(0); pm_freezing = true; error = try_to_freeze_tasks(true); - if (!error) { + if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); - pr_cont("done."); - } - pr_cont("\n"); + BUG_ON(in_atomic()); /* @@ -171,14 +166,9 @@ int freeze_kernel_threads(void) { int error; - pr_info("Freezing remaining freezable tasks ... "); - pm_nosig_freezing = true; error = try_to_freeze_tasks(false); - if (!error) - pr_cont("done."); - pr_cont("\n"); BUG_ON(in_atomic()); if (error) @@ -193,19 +183,17 @@ void thaw_processes(void) trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; oom_killer_enable(); - pr_info("Restarting tasks ... "); + pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ @@ -220,7 +208,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - pr_cont("done.\n"); + pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -229,7 +217,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - pr_info("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); @@ -241,5 +229,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - pr_cont("done.\n"); + pr_info("Done restarting kernel threads.\n"); } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index ec7e1e85923e..f7d8064e9adc 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -220,6 +220,11 @@ static struct pm_qos_constraints cpu_latency_constraints = { .type = PM_QOS_MIN, }; +static inline bool cpu_latency_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. */ @@ -263,7 +268,7 @@ static void cpu_latency_qos_apply(struct pm_qos_request *req, */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { @@ -289,7 +294,7 @@ EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { @@ -410,6 +415,105 @@ static struct miscdevice cpu_latency_qos_miscdev = { .fops = &cpu_latency_qos_fops, }; +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +/* The CPU system wakeup latency QoS. */ +static struct pm_qos_constraints cpu_wakeup_latency_constraints = { + .list = PLIST_HEAD_INIT(cpu_wakeup_latency_constraints.list), + .target_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .default_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT, + .type = PM_QOS_MIN, +}; + +/** + * cpu_wakeup_latency_qos_limit - Current CPU system wakeup latency QoS limit. + * + * Returns the current CPU system wakeup latency QoS limit that may have been + * requested by user space. + */ +s32 cpu_wakeup_latency_qos_limit(void) +{ + return pm_qos_read_value(&cpu_wakeup_latency_constraints); +} + +static int cpu_wakeup_latency_qos_open(struct inode *inode, struct file *filp) +{ + struct pm_qos_request *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->qos = &cpu_wakeup_latency_constraints; + pm_qos_update_target(req->qos, &req->node, PM_QOS_ADD_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + filp->private_data = req; + + return 0; +} + +static int cpu_wakeup_latency_qos_release(struct inode *inode, + struct file *filp) +{ + struct pm_qos_request *req = filp->private_data; + + filp->private_data = NULL; + pm_qos_update_target(req->qos, &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_RESUME_LATENCY_NO_CONSTRAINT); + kfree(req); + + return 0; +} + +static ssize_t cpu_wakeup_latency_qos_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value = pm_qos_read_value(&cpu_wakeup_latency_constraints); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t cpu_wakeup_latency_qos_write(struct file *filp, + const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct pm_qos_request *req = filp->private_data; + s32 value; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else { + int ret; + + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; + } + + if (value < 0) + return -EINVAL; + + pm_qos_update_target(req->qos, &req->node, PM_QOS_UPDATE_REQ, value); + + return count; +} + +static const struct file_operations cpu_wakeup_latency_qos_fops = { + .open = cpu_wakeup_latency_qos_open, + .release = cpu_wakeup_latency_qos_release, + .read = cpu_wakeup_latency_qos_read, + .write = cpu_wakeup_latency_qos_write, + .llseek = noop_llseek, +}; + +static struct miscdevice cpu_wakeup_latency_qos_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "cpu_wakeup_latency", + .fops = &cpu_wakeup_latency_qos_fops, +}; +#endif /* CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP */ + static int __init cpu_latency_qos_init(void) { int ret; @@ -419,6 +523,13 @@ static int __init cpu_latency_qos_init(void) pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP + ret = misc_register(&cpu_wakeup_latency_qos_miscdev); + if (ret < 0) + pr_err("%s: %s setup failed\n", __func__, + cpu_wakeup_latency_qos_miscdev.name); +#endif + return ret; } late_initcall(cpu_latency_qos_init); @@ -426,6 +537,11 @@ late_initcall(cpu_latency_qos_init); /* Definitions related to the frequency QoS below. */ +static inline bool freq_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. @@ -531,7 +647,7 @@ int freq_qos_add_request(struct freq_constraints *qos, { int ret; - if (IS_ERR_OR_NULL(qos) || !req) + if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), @@ -563,7 +679,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request); */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { - if (!req) + if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f7a986078213..0a946932d5c1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -58,22 +58,24 @@ static inline void hibernate_restore_protection_end(void) hibernate_restore_protection_active = false; } -static inline void hibernate_restore_protect_page(void *page_address) +static inline int __must_check hibernate_restore_protect_page(void *page_address) { if (hibernate_restore_protection_active) - set_memory_ro((unsigned long)page_address, 1); + return set_memory_ro((unsigned long)page_address, 1); + return 0; } -static inline void hibernate_restore_unprotect_page(void *page_address) +static inline int hibernate_restore_unprotect_page(void *page_address) { if (hibernate_restore_protection_active) - set_memory_rw((unsigned long)page_address, 1); + return set_memory_rw((unsigned long)page_address, 1); + return 0; } #else static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} -static inline void hibernate_restore_protect_page(void *page_address) {} -static inline void hibernate_restore_unprotect_page(void *page_address) {} +static inline int __must_check hibernate_restore_protect_page(void *page_address) {return 0; } +static inline int hibernate_restore_unprotect_page(void *page_address) {return 0; } #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ @@ -326,7 +328,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -/** +/* * Data types related to memory bitmaps. * * Memory bitmap is a structure consisting of many linked lists of @@ -361,7 +363,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) * * One radix tree is represented by one struct mem_zone_bm_rtree. There are * two linked lists for the nodes of the tree, one for the inner nodes and - * one for the leave nodes. The linked leave nodes are used for fast linear + * one for the leaf nodes. The linked leaf nodes are used for fast linear * access of the memory bitmap. * * The struct rtree_node represents one node of the radix tree. @@ -398,12 +400,13 @@ struct mem_zone_bm_rtree { unsigned int blocks; /* Number of Bitmap Blocks */ }; -/* strcut bm_position is used for browsing memory bitmaps */ +/* struct bm_position is used for browsing memory bitmaps */ struct bm_position { struct mem_zone_bm_rtree *zone; struct rtree_node *node; unsigned long node_pfn; + unsigned long cur_pfn; int node_bit; }; @@ -427,6 +430,10 @@ struct memory_bitmap { /** * alloc_rtree_node - Allocate a new node and add it to the radix tree. + * @gfp_mask: GFP mask for the allocation. + * @safe_needed: Get pages not used before hibernation (restore only) + * @ca: Pointer to a linked list of pages ("a chain") to allocate from + * @list: Radix Tree node to add. * * This function is used to allocate inner nodes as well as the * leave nodes of the radix tree. It also adds the node to the @@ -585,6 +592,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; + bm->cur.cur_pfn = BM_END_OF_MAP; bm->cur.node_bit = 0; } @@ -795,6 +803,7 @@ node_found: bm->cur.zone = zone; bm->cur.node = node; bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.cur_pfn = pfn; /* Set return values */ *addr = node->data; @@ -846,6 +855,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm) clear_bit(bit, bm->cur.node->data); } +static unsigned long memory_bm_get_current(struct memory_bitmap *bm) +{ + return bm->cur.cur_pfn; +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -902,7 +916,7 @@ static bool rtree_next_node(struct memory_bitmap *bm) } /** - * memory_bm_rtree_next_pfn - Find the next set bit in a memory bitmap. + * memory_bm_next_pfn - Find the next set bit in a memory bitmap. * @bm: Memory bitmap. * * Starting from the last returned position this function searches for the next @@ -925,10 +939,12 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) if (bit < bits) { pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; bm->cur.node_bit = bit + 1; + bm->cur.cur_pfn = pfn; return pfn; } } while (rtree_next_node(bm)); + bm->cur.cur_pfn = BM_END_OF_MAP; return BM_END_OF_MAP; } @@ -978,8 +994,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm) * Register a range of page frames the contents of which should not be saved * during hibernation (to be used in the early initialization code). */ -void __init __register_nosave_region(unsigned long start_pfn, - unsigned long end_pfn, int use_kmalloc) +void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) { struct nosave_region *region; @@ -995,18 +1010,9 @@ void __init __register_nosave_region(unsigned long start_pfn, goto Report; } } - if (use_kmalloc) { - /* During init, this shouldn't fail */ - region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); - BUG_ON(!region); - } else { - /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), - SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); - } + /* This allocation cannot fail */ + region = memblock_alloc_or_panic(sizeof(struct nosave_region), + SMP_CACHE_BYTES); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -1088,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -1112,7 +1117,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) int create_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - int error = 0; + int error; if (forbidden_pages_map && free_pages_map) return 0; @@ -1231,6 +1236,57 @@ unsigned int snapshot_additional_pages(struct zone *zone) return 2 * rtree; } +/* + * Touch the watchdog for every WD_PAGE_COUNT pages. + */ +#define WD_PAGE_COUNT (128*1024) + +static void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} + #ifdef CONFIG_HIGHMEM /** * count_free_highmem_pages - Compute the total number of free highmem pages. @@ -1304,11 +1360,6 @@ static unsigned int count_highmem_pages(void) } return n; } -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} #endif /* CONFIG_HIGHMEM */ /** @@ -1374,14 +1425,19 @@ static unsigned int count_data_pages(void) /* * This is needed, because copy_page and memcpy are not usable for copying - * task structs. + * task structs. Returns true if the page was filled with only zeros, + * otherwise false. */ -static inline void do_copy_page(long *dst, long *src) +static inline bool do_copy_page(long *dst, long *src) { + long z = 0; int n; - for (n = PAGE_SIZE / sizeof(long); n; n--) + for (n = PAGE_SIZE / sizeof(long); n; n--) { + z |= *src; *dst++ = *src++; + } + return !z; } /** @@ -1390,17 +1446,21 @@ static inline void do_copy_page(long *dst, long *src) * Check if the page we are going to copy is marked as present in the kernel * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() - * always returns 'true'. + * always returns 'true'. Returns true if the page was entirely composed of + * zeros, otherwise it will return false. */ -static void safe_copy_page(void *dst, struct page *s_page) +static bool safe_copy_page(void *dst, struct page *s_page) { + bool zeros_only; + if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); } else { hibernate_map_page(s_page); - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); hibernate_unmap_page(s_page); } + return zeros_only; } #ifdef CONFIG_HIGHMEM @@ -1410,49 +1470,59 @@ static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; + bool zeros_only; s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page); - dst = kmap_atomic(d_page); - do_copy_page(dst, src); - kunmap_atomic(dst); - kunmap_atomic(src); + src = kmap_local_page(s_page); + dst = kmap_local_page(d_page); + zeros_only = do_copy_page(dst, src); + kunmap_local(dst); + kunmap_local(src); } else { if (PageHighMem(d_page)) { /* * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page); + zeros_only = safe_copy_page(buffer, s_page); + dst = kmap_local_page(d_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); } else { - safe_copy_page(page_address(d_page), s_page); + zeros_only = safe_copy_page(page_address(d_page), s_page); } } + return zeros_only; } #else #define page_is_saveable(zone, pfn) saveable_page(zone, pfn) -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - safe_copy_page(page_address(pfn_to_page(dst_pfn)), + return safe_copy_page(page_address(pfn_to_page(dst_pfn)), pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -static void copy_data_pages(struct memory_bitmap *copy_bm, - struct memory_bitmap *orig_bm) +/* + * Copy data pages will copy all pages into pages pulled from the copy_bm. + * If a page was entirely filled with zeros it will be marked in the zero_bm. + * + * Returns the number of pages copied. + */ +static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm, + struct memory_bitmap *zero_bm) { + unsigned long copied_pages = 0; struct zone *zone; - unsigned long pfn; + unsigned long pfn, copy_pfn; for_each_populated_zone(zone) { unsigned long max_zone_pfn; @@ -1465,18 +1535,29 @@ static void copy_data_pages(struct memory_bitmap *copy_bm, } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); - for(;;) { + copy_pfn = memory_bm_next_pfn(copy_bm); + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + if (copy_data_page(copy_pfn, pfn)) { + memory_bm_set_bit(zero_bm, pfn); + /* Use this copy_pfn for a page that is not full of zeros */ + continue; + } + copied_pages++; + copy_pfn = memory_bm_next_pfn(copy_bm); } + return copied_pages; } /* Total number of image pages */ static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* Number of zero pages */ +static unsigned int nr_zero_pages; + /* * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. @@ -1497,6 +1578,9 @@ static struct memory_bitmap orig_bm; */ static struct memory_bitmap copy_bm; +/* Memory bitmap which tracks which saveable pages were zero filled. */ +static struct memory_bitmap zero_bm; + /** * swsusp_free - Free pages allocated for hibernation image. * @@ -1541,6 +1625,7 @@ loop: out: nr_copy_pages = 0; nr_meta_pages = 0; + nr_zero_pages = 0; restore_pblist = NULL; buffer = NULL; alloc_normal = 0; @@ -1726,8 +1811,8 @@ static unsigned long minimum_image_size(unsigned long saveable) * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 - * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2 + * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. * @@ -1759,8 +1844,15 @@ int hibernate_preallocate_memory(void) goto err_out; } + error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate zero bitmap\n"); + goto err_out; + } + alloc_normal = 0; alloc_highmem = 0; + nr_zero_pages = 0; /* Count the number of saveable data pages. */ save_highmem = count_highmem_pages(); @@ -1944,7 +2036,7 @@ static inline int get_highmem_buffer(int safe_needed) } /** - * alloc_highmem_image_pages - Allocate some highmem pages for the image. + * alloc_highmem_pages - Allocate some highmem pages for the image. * * Try to allocate as many pages as needed, but if the number of free highmem * pages is less than that, allocate them all. @@ -2018,41 +2110,40 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating image:\n"); + pm_deferred_pr_dbg("Creating image\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); + pm_deferred_pr_dbg("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - pr_err("Not enough free memory\n"); + pm_deferred_pr_dbg("Not enough free memory for image creation\n"); return -ENOMEM; } - if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - pr_err("Memory allocation failed\n"); + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) return -ENOMEM; - } /* * During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(NULL); - copy_data_pages(©_bm, &orig_bm); + nr_copy_pages = copy_data_pages(©_bm, &orig_bm, &zero_bm); /* * End of critical section. From now on, we can write to memory, * but we should not touch disk. This specially means we must _not_ * touch swap space! Except we must write out our image of course. */ - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; + /* We don't actually copy the zero pages */ + nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied)\n", nr_pages); + pm_deferred_pr_dbg("Image created (%d pages copied, %d zero pages)\n", + nr_copy_pages, nr_zero_pages); return 0; } @@ -2069,13 +2160,13 @@ static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -2097,15 +2188,22 @@ static int init_header(struct swsusp_info *info) return init_header_complete(info); } +#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1)) +#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG) + /** * pack_pfns - Prepare PFNs for saving. * @bm: Memory bitmap. * @buf: Memory buffer to store the PFNs in. + * @zero_bm: Memory bitmap containing PFNs of zero pages. * * PFNs corresponding to set bits in @bm are stored in the area of memory - * pointed to by @buf (1 page at a time). + * pointed to by @buf (1 page at a time). Pages which were filled with only + * zeros will have the highest bit set in the packed format to distinguish + * them from PFNs which will be contained in the image file. */ -static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { int j; @@ -2113,6 +2211,8 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + if (memory_bm_test_bit(zero_bm, buf[j])) + buf[j] |= ENCODED_PFN_ZERO_FLAG; } } @@ -2154,7 +2254,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { clear_page(buffer); - pack_pfns(buffer, &orig_bm); + pack_pfns(buffer, &orig_bm, &zero_bm); } else { struct page *page; @@ -2167,9 +2267,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2231,7 +2331,7 @@ static int check_header(struct swsusp_info *info) } /** - * load header - Check the image header and copy the data from it. + * load_header - Check the image header and copy the data from it. */ static int load_header(struct swsusp_info *info) { @@ -2250,22 +2350,37 @@ static int load_header(struct swsusp_info *info) * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. * @bm: Memory bitmap. * @buf: Area of memory containing the PFNs. + * @zero_bm: Memory bitmap with the zero PFNs marked. * * For each element of the array pointed to by @buf (1 page at a time), set the - * corresponding bit in @bm. + * corresponding bit in @bm. If the page was originally populated with only + * zeros then a corresponding bit will also be set in @zero_bm. */ -static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { + unsigned long decoded_pfn; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { if (unlikely(buf[j] == BM_END_OF_MAP)) break; - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) - memory_bm_set_bit(bm, buf[j]); - else + zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG); + decoded_pfn = buf[j] & ENCODED_PFN_MASK; + if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) { + memory_bm_set_bit(bm, decoded_pfn); + if (zero) { + memory_bm_set_bit(zero_bm, decoded_pfn); + nr_zero_pages++; + } + } else { + if (!pfn_valid(decoded_pfn)) + pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", + (unsigned long long)PFN_PHYS(decoded_pfn)); return -EFAULT; + } } return 0; @@ -2421,8 +2536,9 @@ static void *get_highmem_page_buffer(struct page *page, pbe->copy_page = tmp; } else { /* Copy of the page will be stored in normal memory */ - kaddr = safe_pages_list; - safe_pages_list = safe_pages_list->next; + kaddr = __get_safe_page(ca->gfp_mask); + if (!kaddr) + return ERR_PTR(-ENOMEM); pbe->copy_page = virt_to_page(kaddr); } pbe->next = highmem_pblist; @@ -2442,9 +2558,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2485,6 +2601,7 @@ static inline void free_highmem_data(void) {} * prepare_image - Make room for loading hibernation image. * @new_bm: Uninitialized memory bitmap structure. * @bm: Memory bitmap with unsafe pages marked. + * @zero_bm: Memory bitmap containing the zero pages. * * Use @bm to mark the pages that will be overwritten in the process of * restoring the system memory state from the suspend image ("unsafe" pages) @@ -2495,10 +2612,15 @@ static inline void free_highmem_data(void) {} * pages will be used for just yet. Instead, we mark them all as allocated and * create a lists of "safe" pages to be used later. On systems with high * memory a list of "safe" highmem pages is created too. + * + * Because it was not known which pages were unsafe when @zero_bm was created, + * make a copy of it and recreate it within safe pages. */ -static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { unsigned int nr_pages, nr_highmem; + struct memory_bitmap tmp; struct linked_page *lp; int error; @@ -2515,6 +2637,24 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + + /* Make a copy of zero_bm so it can be created in safe pages */ + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(&tmp, zero_bm); + memory_bm_free(zero_bm, PG_UNSAFE_KEEP); + + /* Recreate zero_bm in safe pages */ + error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(zero_bm, &tmp); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); + /* At this point zero_bm is in safe pages and it can be used for restoring. */ + if (nr_highmem > 0) { error = prepare_highmem_image(bm, &nr_highmem); if (error) @@ -2529,7 +2669,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) * * nr_copy_pages cannot be less than allocated_unsafe_pages too. */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { lp = get_image_page(GFP_ATOMIC, PG_SAFE); @@ -2542,7 +2682,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) nr_pages--; } /* Preallocate memory for the image */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -2602,8 +2742,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); - pbe->address = safe_pages_list; - safe_pages_list = safe_pages_list->next; + pbe->address = __get_safe_page(ca->gfp_mask); + if (!pbe->address) + return ERR_PTR(-ENOMEM); pbe->next = restore_pblist; restore_pblist = pbe; return pbe->address; @@ -2628,14 +2769,13 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; - int error = 0; + int error; +next: /* Check if we have already loaded the entire image */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) return 0; - handle->sync_read = 1; - if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ @@ -2656,35 +2796,50 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + nr_zero_pages = 0; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { - error = unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm, &zero_bm); if (error) return error; if (handle->cur == nr_meta_pages + 1) { - error = prepare_image(&orig_bm, ©_bm); + error = prepare_image(&orig_bm, ©_bm, &zero_bm); if (error) return error; chain_init(&ca, GFP_ATOMIC, PG_SAFE); memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(&zero_bm); restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); - hibernate_restore_protect_page(handle->buffer); + error = hibernate_restore_protect_page(handle->buffer); + if (error) + return error; handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } + handle->sync_read = (handle->buffer == buffer); handle->cur++; + + /* Zero pages were not included in the image, memset it and move on. */ + if (handle->cur > nr_meta_pages + 1 && + memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) { + memset(handle->buffer, 0, PAGE_SIZE); + goto next; + } + return PAGE_SIZE; } @@ -2696,21 +2851,24 @@ int snapshot_write_next(struct snapshot_handle *handle) * stored in highmem. Additionally, it recycles bitmap memory that's not * necessary any more. */ -void snapshot_write_finalize(struct snapshot_handle *handle) +int snapshot_write_finalize(struct snapshot_handle *handle) { + int error; + copy_last_highmem_page(); - hibernate_restore_protect_page(handle->buffer); + error = hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { memory_bm_recycle(&orig_bm); free_highmem_data(); } + return error; } int snapshot_image_loaded(struct snapshot_handle *handle) { return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); + handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages); } #ifdef CONFIG_HIGHMEM @@ -2720,13 +2878,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 80cc1f0f502b..2da4482bb6eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/fs.h> #include "power.h" @@ -75,9 +76,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); s2idle_ops = ops; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } static void s2idle_begin(void) @@ -89,6 +92,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -96,15 +109,17 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ swait_event_exclusive(s2idle_wait_head, s2idle_state == S2IDLE_STATE_WAKE); - cpus_read_unlock(); + /* + * Kick all CPUs to ensure that they resume their timers and restore + * consistent system state. + */ + wake_up_all_idle_cpus(); raw_spin_lock_irq(&s2idle_lock); @@ -136,7 +151,8 @@ static void s2idle_loop(void) break; } - pm_wakeup_clear(false); + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); s2idle_enter(); } @@ -189,6 +205,7 @@ static int __init mem_sleep_default_setup(char *str) if (mem_sleep_labels[state] && !strcmp(str, mem_sleep_labels[state])) { mem_sleep_default = state; + mem_sleep_current = state; break; } @@ -202,7 +219,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); suspend_ops = ops; @@ -218,7 +237,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) mem_sleep_current = PM_SUSPEND_MEM; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(suspend_set_ops); @@ -238,7 +257,8 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE || valid_state(state); + return state == PM_SUSPEND_TO_IDLE || + (valid_state(state) && !cxl_mem_active()); } static int platform_suspend_prepare(suspend_state_t state) @@ -324,10 +344,14 @@ MODULE_PARM_DESC(pm_test_delay, static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG + int i; + if (pm_test_level == level) { pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); - mdelay(pm_test_delay * 1000); + for (i = 0; i < pm_test_delay && !pm_wakeup_pending(); i++) + msleep(1000); + return 1; } #endif /* !CONFIG_PM_DEBUG */ @@ -355,14 +379,15 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); if (!error) return 0; - suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: pm_restore_console(); @@ -490,7 +515,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (error) goto Close; - suspend_console(); + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -509,9 +534,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - trace_suspend_resume(TPS("resume_console"), state, true); - resume_console(); - trace_suspend_resume(TPS("resume_console"), state, false); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); Close: platform_resume_end(state); @@ -532,6 +557,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } @@ -567,7 +593,11 @@ static int enter_state(suspend_state_t state) if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); - ksys_sync_helper(); + + error = pm_sleep_fs_sync(); + if (error) + goto Unlock; + trace_suspend_resume(TPS("sync_filesystems"), 0, false); } @@ -582,9 +612,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("suspend_enter"), state, false); pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); - pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: events_check_enabled = false; @@ -611,12 +639,7 @@ int pm_suspend(suspend_state_t state) pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } + dpm_save_errno(error); pr_info("suspend exit\n"); return error; } diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d20526c5be15..d4856ec61570 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value) value++; suspend_type = strsep(&value, ","); if (!suspend_type) - return 0; + return 1; repeat = strsep(&value, ","); if (repeat) { if (kstrtou32(repeat, 0, &test_repeat_count_max)) - return 0; + return 1; } for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; - return 0; + return 1; } printk(warn_bad_state, suspend_type); - return 0; + return 1; } __setup("test_suspend", setup_test_suspend); @@ -201,7 +201,7 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? */ - dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm); if (dev) { rtc = rtc_class_open(dev_name(dev)); put_device(dev); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..33a186373bef 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,11 +12,11 @@ #define pr_fmt(fmt) "PM: " fmt +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> -#include <linux/genhd.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -24,7 +24,6 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> -#include <linux/lzo.h> #include <linux/vmalloc.h> #include <linux/cpumask.h> #include <linux/atomic.h> @@ -47,19 +46,18 @@ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_ENTRIES swap entries. These + * structures are stored on the swap and linked together with the help of the + * .next_swap member. * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. + * The swap map is created during suspend. The swap map pages are allocated and + * populated one at a time, so we only need one memory page to set up the entire + * structure. * - * During resume we pick up all swap_map_page structures into a list. + * During resume we pick up all swap_map_page structures into a list. */ - #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* @@ -89,11 +87,9 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way +/* + * The swap_map_handle structure is used for handling swap in a file-alike way. */ - struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; @@ -117,11 +113,10 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. +/* + * The following functions are used for tracing the allocated swap pages, so + * that they can be freed in case of an error. */ - struct swsusp_extent { struct rb_node node; unsigned long start; @@ -171,15 +166,14 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - sector_t alloc_swapdev_block(int swap) { unsigned long offset; + /* + * Allocate a swap page and register that it has been allocated, so that + * it can be freed in case of an error. + */ offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) @@ -190,24 +184,21 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entries had been - * allocated. - */ - void free_all_swap_pages(int swap) { struct rb_node *node; + /* + * Free swap pages allocated for saving image data. It also frees the + * extents used to register which swap entries had been allocated. + */ while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; - unsigned long offset; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_nr(swp_entry(swap, ext->start), + ext->end - ext->start + 1); kfree(ext); } @@ -223,7 +214,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *hib_resume_bdev; +static struct file *hib_resume_bdev_file; struct hib_bio_batch { atomic_t count; @@ -270,36 +261,26 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, - struct hib_bio_batch *hb) +static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr) +{ + return bdev_rw_virt(file_bdev(hib_resume_bdev_file), + page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf); +} + +static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) { - struct page *page = virt_to_page(addr); struct bio *bio; - int error = 0; - bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); + bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio_set_dev(bio, hib_resume_bdev); - bio_set_op_attrs(bio, op, op_flags); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - pr_err("Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - return -EFAULT; - } - - if (hb) { - bio->bi_end_io = hib_end_io; - bio->bi_private = hb; - atomic_inc(&hb->count); - submit_bio(bio); - } else { - error = submit_bio_wait(bio); - bio_put(bio); - } - - return error; + bio_add_virt_nofail(bio, addr, PAGE_SIZE); + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + return 0; } static int hib_wait_io(struct hib_bio_batch *hb) @@ -315,12 +296,12 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ + static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -333,8 +314,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, - swsusp_resume_block, swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header); } else { pr_err("Swap header not found!\n"); error = -ENODEV; @@ -342,16 +323,21 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) return error; } -/** - * swsusp_swap_check - check if the resume device is a swap device - * and get its index (if so) - * - * This is called before saving image +/* + * Hold the swsusp_header flag. This is used in software_resume() in + * 'kernel/power/hibernate' to check if the image is compressed and query + * for the compression algorithm support(if so). */ +unsigned int swsusp_header_flags; + static int swsusp_swap_check(void) { int res; + /* + * Check if the resume device is a swap device and get its index (if so). + * This is called before saving the image. + */ if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else @@ -360,57 +346,40 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL); - if (IS_ERR(hib_resume_bdev)) - return PTR_ERR(hib_resume_bdev); + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); + if (IS_ERR(hib_resume_bdev_file)) + return PTR_ERR(hib_resume_bdev_file); - res = set_blocksize(hib_resume_bdev, PAGE_SIZE); - if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); - - return res; + return 0; } -/** - * write_page - Write one page to given swap location. - * @buf: Address we're writing. - * @offset: Offset of the swap page we're writing to. - * @hb: bio completion batch - */ - static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; void *src; int ret; if (!offset) return -ENOSPC; - if (hb) { - src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_io(hb); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(GFP_NOIO | - __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - hb = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; + if (!hb) + goto sync_io; + + src = (void *)__get_free_page(gfp); + if (!src) { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(gfp); + if (WARN_ON_ONCE(!src)) + goto sync_io; } - return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); + + copy_page(src, buf); + return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +sync_io: + return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf); } static void release_swap_writer(struct swap_map_handle *handle) @@ -447,14 +416,14 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(FMODE_WRITE); + swsusp_close(); return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { - int error = 0; + int error; sector_t offset; if (!handle->cur) @@ -512,34 +481,36 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(FMODE_WRITE); + swsusp_close(); return error; } +/* + * Bytes we need for compressed data in worst case. We assume(limitation) + * this is the worst of all the compression algorithms. + */ +#define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2) + /* We need to remember how much compressed data we need to read. */ -#define LZO_HEADER sizeof(size_t) +#define CMP_HEADER sizeof(size_t) /* Number of pages/bytes we'll compress at one time. */ -#define LZO_UNC_PAGES 32 -#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) +#define UNC_PAGES 32 +#define UNC_SIZE (UNC_PAGES * PAGE_SIZE) -/* Number of pages/bytes we need for compressed data (worst case). */ -#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ - LZO_HEADER, PAGE_SIZE) -#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Number of pages we need for compressed data (worst case). */ +#define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \ + CMP_HEADER, PAGE_SIZE) +#define CMP_SIZE (CMP_PAGES * PAGE_SIZE) -/* Maximum number of threads for compression/decompression. */ -#define LZO_THREADS 3 +/* Default number of threads for compression/decompression. */ +#define CMP_THREADS 3 +static unsigned int hibernate_compression_threads = CMP_THREADS; /* Minimum/maximum number of pages for read buffering. */ -#define LZO_MIN_RD_PAGES 1024 -#define LZO_MAX_RD_PAGES 8192 - - -/** - * save_image - save the suspend image data - */ +#define CMP_MIN_RD_PAGES 1024 +#define CMP_MAX_RD_PAGES 8192 static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, @@ -585,7 +556,7 @@ static int save_image(struct swap_map_handle *handle, return ret; } -/** +/* * Structure used for CRC32. */ struct crc_data { @@ -596,24 +567,59 @@ struct crc_data { wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ - size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ - unsigned char *unc[LZO_THREADS]; /* uncompressed data */ + size_t **unc_len; /* uncompressed lengths */ + unsigned char **unc; /* uncompressed data */ }; -/** - * CRC32 update function that runs in its own thread. - */ +static struct crc_data *alloc_crc_data(int nr_threads) +{ + struct crc_data *crc; + + crc = kzalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) + return NULL; + + crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); + if (!crc->unc) + goto err_free_crc; + + crc->unc_len = kcalloc(nr_threads, sizeof(*crc->unc_len), GFP_KERNEL); + if (!crc->unc_len) + goto err_free_unc; + + return crc; + +err_free_unc: + kfree(crc->unc); +err_free_crc: + kfree(crc); + return NULL; +} + +static void free_crc_data(struct crc_data *crc) +{ + if (!crc) + return; + + if (crc->thr) + kthread_stop(crc->thr); + + kfree(crc->unc_len); + kfree(crc->unc); + kfree(crc); +} + static int crc32_threadfn(void *data) { struct crc_data *d = data; unsigned i; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } @@ -622,16 +628,19 @@ static int crc32_threadfn(void *data) for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * Structure used for LZO data compression. + +/* + * Structure used for data compression. */ struct cmp_data { struct task_struct *thr; /* thread */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -639,48 +648,47 @@ struct cmp_data { wait_queue_head_t done; /* compression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ - unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/** - * Compression function that runs in its own thread. - */ -static int lzo_compress_threadfn(void *data) +/* Indicates the image size after compression */ +static atomic64_t compressed_size = ATOMIC_INIT(0); + +static int compress_threadfn(void *data) { struct cmp_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); - d->ret = lzo1x_1_compress(d->unc, d->unc_len, - d->cmp + LZO_HEADER, &d->cmp_len, - d->wrk); - atomic_set(&d->stop, 1); + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; + + atomic64_add(d->cmp_len, &compressed_size); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * save_image_lzo - Save the suspend image data compressed with LZO. - * @handle: Swap map handle to use for saving the image. - * @snapshot: Image to read data from. - * @nr_to_write: Number of pages to save. - */ -static int save_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_write) +static int save_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) { unsigned int m; int ret = 0; @@ -690,35 +698,37 @@ static int save_image_lzo(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); + atomic64_set(&compressed_size, 0); + /* * We'll limit the number of threads for compression to limit memory * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { - pr_err("Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - data = vzalloc(array_size(nr_threads, sizeof(*data))); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { - pr_err("Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -732,7 +742,21 @@ static int save_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_compress_threadfn, + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { @@ -770,7 +794,7 @@ static int save_image_lzo(struct swap_map_handle *handle, */ handle->reqd_free_pages = reqd_free_pages(); - pr_info("Using %u thread(s) for compression\n", nr_threads); + pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo); pr_info("Compressing and saving image data (%u pages)...\n", nr_to_write); m = nr_to_write / 10; @@ -780,7 +804,7 @@ static int save_image_lzo(struct swap_map_handle *handle, start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) { ret = snapshot_read_next(snapshot); if (ret < 0) goto out_finish; @@ -801,7 +825,7 @@ static int save_image_lzo(struct swap_map_handle *handle, data[thr].unc_len = off; - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } @@ -809,25 +833,25 @@ static int save_image_lzo(struct swap_map_handle *handle, break; crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { - pr_err("LZO compression failed\n"); + pr_err("%s compression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(data[thr].unc_len))) { - pr_err("Invalid LZO compressed length\n"); + bytes_worst_compress(data[thr].unc_len))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -843,7 +867,7 @@ static int save_image_lzo(struct swap_map_handle *handle, * read it. */ for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); @@ -853,7 +877,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } } - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } @@ -862,34 +886,33 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + } else { + pr_err("Image saving failed: %d\n", ret); + } + out_clean: hib_finish_batch(&hb); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); + } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } -/** - * enough_swap - Make sure we have enough swap to save the image. - * - * Returns TRUE or FALSE after checking the total amount of swap - * space available from the resume partition. - */ - static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); @@ -902,15 +925,16 @@ static int enough_swap(unsigned int nr_pages) } /** - * swsusp_write - Write entire image and metadata. - * @flags: flags to pass to the "boot" kernel in the image header + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want them + * synced (in case something goes wrong) but we DO not want to mark filesystem + * clean: it is not. (And it does not matter, if we resume correctly, we'll mark + * system clean, anyway.) * - * It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) + * Return: 0 on success, negative error code on failure. */ - int swsusp_write(unsigned int flags) { struct swap_map_handle handle; @@ -945,16 +969,16 @@ int swsusp_write(unsigned int flags) if (!error) { error = (flags & SF_NOCOMPRESS_MODE) ? save_image(&handle, &snapshot, pages - 1) : - save_image_lzo(&handle, &snapshot, pages - 1); + save_compressed_image(&handle, &snapshot, pages - 1); } out_finish: error = swap_writer_finish(&handle, flags, error); return error; } -/** - * The following functions allow us to read data using a swap map - * in a file-alike way +/* + * The following functions allow us to read data using a swap map in a file-like + * way. */ static void release_swap_reader(struct swap_map_handle *handle) @@ -1005,7 +1029,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); + error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map); if (error) { release_swap_reader(handle); return error; @@ -1029,7 +1053,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); + if (hb) + error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb); + else + error = hib_submit_io_sync(REQ_OP_READ, offset, buf); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1053,12 +1080,6 @@ static int swap_reader_finish(struct swap_map_handle *handle) return 0; } -/** - * load_image - load the image using the swap map handle - * @handle and the snapshot handle @snapshot - * (assume there are @nr_pages pages to load) - */ - static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) @@ -1103,19 +1124,21 @@ static int load_image(struct swap_map_handle *handle, ret = err2; if (!ret) { pr_info("Image loading done\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; } swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } -/** - * Structure used for LZO data decompression. +/* + * Structure used for data decompression. */ struct dec_data { struct task_struct *thr; /* thread */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1123,51 +1146,47 @@ struct dec_data { wait_queue_head_t done; /* decompression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ - unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ - unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; -/** - * Decompression function that runs in its own thread. - */ -static int lzo_decompress_threadfn(void *data) +static int decompress_threadfn(void *data) { struct dec_data *d = data; while (1) { - wait_event(d->go, atomic_read(&d->ready) || + wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); - d->unc_len = LZO_UNC_SIZE; - d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, - d->unc, &d->unc_len); + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; + if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); - atomic_set(&d->stop, 1); + atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } -/** - * load_image_lzo - Load compressed image data and decompress them with LZO. - * @handle: Swap map handle to use for loading data. - * @snapshot: Image to copy uncompressed data into. - * @nr_to_read: Number of pages to load. - */ -static int load_image_lzo(struct swap_map_handle *handle, - struct snapshot_handle *snapshot, - unsigned int nr_to_read) +static int load_compressed_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) { unsigned int m; int ret = 0; @@ -1192,23 +1211,23 @@ static int load_image_lzo(struct swap_map_handle *handle, * footprint. */ nr_threads = num_online_cpus() - 1; - nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + nr_threads = clamp_val(nr_threads, 1, hibernate_compression_threads); - page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page))); + page = vmalloc_array(CMP_MAX_RD_PAGES, sizeof(*page)); if (!page) { - pr_err("Failed to allocate LZO page\n"); + pr_err("Failed to allocate %s page\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - data = vzalloc(array_size(nr_threads, sizeof(*data))); + data = vcalloc(nr_threads, sizeof(*data)); if (!data) { - pr_err("Failed to allocate LZO data\n"); + pr_err("Failed to allocate %s data\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } - crc = kzalloc(sizeof(*crc), GFP_KERNEL); + crc = alloc_crc_data(nr_threads); if (!crc) { pr_err("Failed to allocate crc\n"); ret = -ENOMEM; @@ -1224,7 +1243,21 @@ static int load_image_lzo(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].thr = kthread_run(lzo_decompress_threadfn, + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR_OR_NULL(data[thr].cc)) { + pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); + ret = -EFAULT; + goto out_clean; + } + + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { @@ -1265,18 +1298,18 @@ static int load_image_lzo(struct swap_map_handle *handle, */ if (low_free_pages() > snapshot_get_image_size()) read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; - read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); + read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { - page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + page[i] = (void *)__get_free_page(i < CMP_PAGES ? GFP_NOIO | __GFP_HIGH : GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { - if (i < LZO_CMP_PAGES) { + if (i < CMP_PAGES) { ring_size = i; - pr_err("Failed to allocate LZO pages\n"); + pr_err("Failed to allocate %s pages\n", hib_comp_algo); ret = -ENOMEM; goto out_clean; } else { @@ -1286,7 +1319,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } want = ring_size = i; - pr_info("Using %u thread(s) for decompression\n", nr_threads); + pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo); pr_info("Loading and decompressing image data (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1338,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); crc->run_threads = 0; } @@ -1347,13 +1380,13 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].cmp_len = *(size_t *)page[pg]; if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > - lzo1x_worst_compress(LZO_UNC_SIZE))) { - pr_err("Invalid LZO compressed length\n"); + bytes_worst_compress(UNC_SIZE))) { + pr_err("Invalid %s compressed length\n", hib_comp_algo); ret = -1; goto out_finish; } - need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER, PAGE_SIZE); if (need > have) { if (eof > 1) { @@ -1364,7 +1397,7 @@ static int load_image_lzo(struct swap_map_handle *handle, } for (off = 0; - off < LZO_HEADER + data[thr].cmp_len; + off < CMP_HEADER + data[thr].cmp_len; off += PAGE_SIZE) { memcpy(data[thr].cmp + off, page[pg], PAGE_SIZE); @@ -1374,14 +1407,14 @@ static int load_image_lzo(struct swap_map_handle *handle, pg = 0; } - atomic_set(&data[thr].ready, 1); + atomic_set_release(&data[thr].ready, 1); wake_up(&data[thr].go); } /* * Wait for more data while we are decompressing. */ - if (have < LZO_CMP_PAGES && asked) { + if (have < CMP_PAGES && asked) { ret = hib_wait_io(&hb); if (ret) goto out_finish; @@ -1393,20 +1426,20 @@ static int load_image_lzo(struct swap_map_handle *handle, for (run_threads = thr, thr = 0; thr < run_threads; thr++) { wait_event(data[thr].done, - atomic_read(&data[thr].stop)); + atomic_read_acquire(&data[thr].stop)); atomic_set(&data[thr].stop, 0); ret = data[thr].ret; if (ret < 0) { - pr_err("LZO decompression failed\n"); + pr_err("%s decompression failed\n", hib_comp_algo); goto out_finish; } if (unlikely(!data[thr].unc_len || - data[thr].unc_len > LZO_UNC_SIZE || - data[thr].unc_len & (PAGE_SIZE - 1))) { - pr_err("Invalid LZO uncompressed length\n"); + data[thr].unc_len > UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + pr_err("Invalid %s uncompressed length\n", hib_comp_algo); ret = -1; goto out_finish; } @@ -1424,7 +1457,7 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = snapshot_write_next(snapshot); if (ret <= 0) { crc->run_threads = thr + 1; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); goto out_finish; } @@ -1432,20 +1465,20 @@ static int load_image_lzo(struct swap_map_handle *handle, } crc->run_threads = thr; - atomic_set(&crc->ready, 1); + atomic_set_release(&crc->ready, 1); wake_up(&crc->go); } out_finish: if (crc->run_threads) { - wait_event(crc->done, atomic_read(&crc->stop)); + wait_event(crc->done, atomic_read_acquire(&crc->stop)); atomic_set(&crc->stop, 0); } stop = ktime_get(); if (!ret) { pr_info("Image loading done\n"); - snapshot_write_finalize(snapshot); - if (!snapshot_image_loaded(snapshot)) + ret = snapshot_write_finalize(snapshot); + if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { @@ -1461,15 +1494,14 @@ out_clean: hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); - if (crc) { - if (crc->thr) - kthread_stop(crc->thr); - kfree(crc); - } + free_crc_data(crc); if (data) { - for (thr = 0; thr < nr_threads; thr++) + for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); + } vfree(data); } vfree(page); @@ -1481,8 +1513,9 @@ out_clean: * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memory location + * + * Return: 0 on success, negative error code on failure. */ - int swsusp_read(unsigned int *flags_p) { int error; @@ -1503,7 +1536,7 @@ int swsusp_read(unsigned int *flags_p) if (!error) { error = (*flags_p & SF_NOCOMPRESS_MODE) ? load_image(&handle, &snapshot, header->pages - 1) : - load_image_lzo(&handle, &snapshot, header->pages - 1); + load_compressed_image(&handle, &snapshot, header->pages - 1); } swap_reader_finish(&handle); end: @@ -1514,32 +1547,35 @@ end: return error; } +static void *swsusp_holder; + /** - * swsusp_check - Check for swsusp signature in the resume device + * swsusp_check - Open the resume device and check for the swsusp signature. + * @exclusive: Open the resume device exclusively. + * + * Return: 0 if a valid image is found, negative error code otherwise. */ - -int swsusp_check(void) +int swsusp_check(bool exclusive) { + void *holder = exclusive ? &swsusp_holder : NULL; int error; - void *holder; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ | FMODE_EXCL, &holder); - if (!IS_ERR(hib_resume_bdev)) { - set_blocksize(hib_resume_bdev, PAGE_SIZE); + hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, + BLK_OPEN_READ, holder, NULL); + if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, 0, - swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, + swsusp_header); if (error) goto put; if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { error = -EINVAL; } @@ -1552,11 +1588,11 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); + bdev_fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev); + error = PTR_ERR(hib_resume_bdev_file); } if (error) @@ -1566,35 +1602,34 @@ put: } /** - * swsusp_close - close swap device. + * swsusp_close - close resume device. */ - -void swsusp_close(fmode_t mode) +void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev)) { + if (IS_ERR(hib_resume_bdev_file)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, mode); + fput(hib_resume_bdev_file); } /** - * swsusp_unmark - Unmark swsusp signature in the resume device + * swsusp_unmark - Unmark swsusp signature in the resume device + * + * Return: 0 on success, negative error code on failure. */ - #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; @@ -1609,8 +1644,46 @@ int swsusp_unmark(void) } #endif +static ssize_t hibernate_compression_threads_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", hibernate_compression_threads); +} + +static ssize_t hibernate_compression_threads_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val)) + return -EINVAL; + + if (val < 1) + return -EINVAL; + + hibernate_compression_threads = val; + return n; +} +power_attr(hibernate_compression_threads); + +static struct attribute *g[] = { + &hibernate_compression_threads_attr.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = g, +}; + static int __init swsusp_header_init(void) { + int error; + + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return -ENOMEM; + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); @@ -1618,3 +1691,19 @@ static int __init swsusp_header_init(void) } core_initcall(swsusp_header_init); + +static int __init hibernate_compression_threads_setup(char *str) +{ + int rc = kstrtouint(str, 0, &hibernate_compression_threads); + + if (rc) + return rc; + + if (hibernate_compression_threads < 1) + hibernate_compression_threads = CMP_THREADS; + + return 1; + +} + +__setup("hibernate_compression_threads=", hibernate_compression_threads_setup); diff --git a/kernel/power/user.c b/kernel/power/user.c index ad241b4ff64c..4401cfe26e5c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -26,6 +26,7 @@ #include "power.h" +static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; @@ -46,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev) static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -78,7 +80,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) * Resuming. We may need to wait for the image device to * appear. */ - wait_for_device_probe(); + need_wait = true; data->swap = -1; data->mode = O_WRONLY; @@ -97,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->dev = 0; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -105,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; @@ -123,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return 0; } @@ -131,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp) static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned int sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -156,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -164,11 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned long sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + + sleep_flags = lock_system_sleep(); data = filp->private_data; @@ -190,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -244,6 +254,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, loff_t size; sector_t offset; + if (need_wait) { + wait_for_device_probe(); + need_wait = false; + } + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) @@ -263,7 +278,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - ksys_sync_helper(); + error = pm_sleep_fs_sync(); + if (error) + break; error = freeze_processes(); if (error) @@ -302,7 +319,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_ATOMIC_RESTORE: - snapshot_write_finalize(&data->handle); + error = snapshot_write_finalize(&data->handle); + if (error) + break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; @@ -430,7 +449,6 @@ static const struct file_operations snapshot_fops = { .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, - .llseek = no_llseek, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 105df4dfc783..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -39,23 +39,23 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) { struct rb_node *node; struct wakelock *wl; - char *str = buf; - char *end = buf + PAGE_SIZE; + int len = 0; mutex_lock(&wakelocks_lock); for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { wl = rb_entry(node, struct wakelock, node); if (wl->ws->active == show_active) - str += scnprintf(str, end - str, "%s ", wl->name); + len += sysfs_emit_at(buf, len, "%s ", wl->name); } - if (str > buf) - str--; - str += scnprintf(str, end - str, "\n"); + if (len > 0) + --len; + + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); - return (str - buf); + return len; } #if CONFIG_PM_WAKELOCKS_LIMIT > 0 |
