summaryrefslogtreecommitdiff
path: root/arch/s390/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/kernel')
-rw-r--r--arch/s390/kernel/.gitignore1
-rw-r--r--arch/s390/kernel/Makefile40
-rw-r--r--arch/s390/kernel/abs_lowcore.c46
-rw-r--r--arch/s390/kernel/alternative.c84
-rw-r--r--arch/s390/kernel/asm-offsets.c130
-rw-r--r--arch/s390/kernel/audit.c12
-rw-r--r--arch/s390/kernel/base.S63
-rw-r--r--arch/s390/kernel/cache.c9
-rw-r--r--arch/s390/kernel/cert_store.c812
-rw-r--r--arch/s390/kernel/compat_audit.c13
-rw-r--r--arch/s390/kernel/compat_linux.h89
-rw-r--r--arch/s390/kernel/compat_signal.c34
-rw-r--r--arch/s390/kernel/cpcmd.c42
-rw-r--r--arch/s390/kernel/cpufeature.c46
-rw-r--r--arch/s390/kernel/crash_dump.c188
-rw-r--r--arch/s390/kernel/ctlreg.c121
-rw-r--r--arch/s390/kernel/debug.c350
-rw-r--r--arch/s390/kernel/diag.c108
-rw-r--r--arch/s390/kernel/dis.c31
-rw-r--r--arch/s390/kernel/dumpstack.c69
-rw-r--r--arch/s390/kernel/early.c155
-rw-r--r--arch/s390/kernel/early_printk.c2
-rw-r--r--arch/s390/kernel/earlypgm.S23
-rw-r--r--arch/s390/kernel/ebcdic.c2
-rw-r--r--arch/s390/kernel/entry.S1475
-rw-r--r--arch/s390/kernel/entry.h62
-rw-r--r--arch/s390/kernel/facility.c21
-rw-r--r--arch/s390/kernel/fpu.c94
-rw-r--r--arch/s390/kernel/ftrace.c335
-rw-r--r--arch/s390/kernel/ftrace.h24
-rw-r--r--arch/s390/kernel/guarded_storage.c6
-rw-r--r--arch/s390/kernel/head64.S26
-rw-r--r--arch/s390/kernel/idle.c130
-rw-r--r--arch/s390/kernel/ipl.c874
-rw-r--r--arch/s390/kernel/ipl_vmparm.c2
-rw-r--r--arch/s390/kernel/irq.c161
-rw-r--r--arch/s390/kernel/jump_label.c42
-rw-r--r--arch/s390/kernel/kprobes.c343
-rw-r--r--arch/s390/kernel/kprobes.h9
-rw-r--r--arch/s390/kernel/kprobes_insn_page.S22
-rw-r--r--arch/s390/kernel/lgr.c5
-rw-r--r--arch/s390/kernel/machine_kexec.c112
-rw-r--r--arch/s390/kernel/machine_kexec_file.c114
-rw-r--r--arch/s390/kernel/machine_kexec_reloc.c1
-rw-r--r--arch/s390/kernel/mcount.S183
-rw-r--r--arch/s390/kernel/module.c273
-rw-r--r--arch/s390/kernel/nmi.c381
-rw-r--r--arch/s390/kernel/nospec-branch.c50
-rw-r--r--arch/s390/kernel/nospec-sysfs.c4
-rw-r--r--arch/s390/kernel/numa.c35
-rw-r--r--arch/s390/kernel/os_info.c23
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c1792
-rw-r--r--arch/s390/kernel/perf_cpum_cf_common.c201
-rw-r--r--arch/s390/kernel/perf_cpum_cf_diag.c705
-rw-r--r--arch/s390/kernel/perf_cpum_cf_events.c277
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c445
-rw-r--r--arch/s390/kernel/perf_event.c64
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c773
-rw-r--r--arch/s390/kernel/perf_pai_ext.c671
-rw-r--r--arch/s390/kernel/perf_regs.c9
-rw-r--r--arch/s390/kernel/pgm_check.S147
-rw-r--r--arch/s390/kernel/process.c120
-rw-r--r--arch/s390/kernel/processor.c228
-rw-r--r--arch/s390/kernel/ptrace.c539
-rw-r--r--arch/s390/kernel/reipl.S10
-rw-r--r--arch/s390/kernel/relocate_kernel.S100
-rw-r--r--arch/s390/kernel/rethook.c34
-rw-r--r--arch/s390/kernel/rethook.h7
-rw-r--r--arch/s390/kernel/runtime_instr.c2
-rw-r--r--arch/s390/kernel/setup.c741
-rw-r--r--arch/s390/kernel/signal.c72
-rw-r--r--arch/s390/kernel/smp.c463
-rw-r--r--arch/s390/kernel/stacktrace.c60
-rw-r--r--arch/s390/kernel/sthyi.c23
-rw-r--r--arch/s390/kernel/suspend.c240
-rw-r--r--arch/s390/kernel/swsusp.S276
-rw-r--r--arch/s390/kernel/syscall.c (renamed from arch/s390/kernel/sys_s390.c)68
-rw-r--r--arch/s390/kernel/syscalls/Makefile3
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl58
-rw-r--r--arch/s390/kernel/sysinfo.c30
-rw-r--r--arch/s390/kernel/text_amode31.S159
-rw-r--r--arch/s390/kernel/time.c503
-rw-r--r--arch/s390/kernel/topology.c143
-rw-r--r--arch/s390/kernel/trace.c2
-rw-r--r--arch/s390/kernel/traps.c200
-rw-r--r--arch/s390/kernel/unwind_bc.c27
-rw-r--r--arch/s390/kernel/uprobes.c23
-rw-r--r--arch/s390/kernel/uv.c717
-rw-r--r--arch/s390/kernel/vdso.c406
-rw-r--r--arch/s390/kernel/vdso32/.gitignore2
-rw-r--r--arch/s390/kernel/vdso32/Makefile70
-rwxr-xr-xarch/s390/kernel/vdso32/gen_vdso_offsets.sh15
-rw-r--r--arch/s390/kernel/vdso32/note.S13
-rw-r--r--arch/s390/kernel/vdso32/vdso32.lds.S142
-rw-r--r--arch/s390/kernel/vdso32/vdso32_wrapper.S15
-rw-r--r--arch/s390/kernel/vdso32/vdso_user_wrapper.S22
-rw-r--r--arch/s390/kernel/vdso64/.gitignore1
-rw-r--r--arch/s390/kernel/vdso64/Makefile55
-rw-r--r--arch/s390/kernel/vdso64/clock_getres.S50
-rw-r--r--arch/s390/kernel/vdso64/clock_gettime.S163
-rwxr-xr-xarch/s390/kernel/vdso64/gen_vdso_offsets.sh15
-rw-r--r--arch/s390/kernel/vdso64/getcpu.S31
-rw-r--r--arch/s390/kernel/vdso64/getcpu.c21
-rw-r--r--arch/s390/kernel/vdso64/gettimeofday.S71
-rw-r--r--arch/s390/kernel/vdso64/vdso.h14
-rw-r--r--arch/s390/kernel/vdso64/vdso64.lds.S14
-rw-r--r--arch/s390/kernel/vdso64/vdso64_generic.c19
-rw-r--r--arch/s390/kernel/vdso64/vdso_user_wrapper.S57
-rw-r--r--arch/s390/kernel/vmlinux.lds.S60
-rw-r--r--arch/s390/kernel/vtime.c64
110 files changed, 11255 insertions, 7274 deletions
diff --git a/arch/s390/kernel/.gitignore b/arch/s390/kernel/.gitignore
index c5f676c3c224..bbb90f92d051 100644
--- a/arch/s390/kernel/.gitignore
+++ b/arch/s390/kernel/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vmlinux.lds
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2b1203cf7be6..7a562b4199c8 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -10,6 +10,7 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
# Do not trace early setup code
CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
endif
@@ -33,51 +34,52 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls
CFLAGS_dumpstack.o += -fno-optimize-sibling-calls
CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
-#
-# Pass UTS_MACHINE for user_regset definition
-#
-CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
-
-obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
-obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o
-obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
+obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
+obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
+obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
+obj-y += sysinfo.o lgr.o os_info.o ctlreg.o
obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
-obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
+obj-y += entry.o reipl.o kdebugfs.o alternative.o
obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y += smp.o
+obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o
-extra-y += head64.o vmlinux.lds
+extra-y += vmlinux.lds
obj-$(CONFIG_SYSFS) += nospec-sysfs.o
CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
-obj-$(CONFIG_HIBERNATION) += suspend.o swsusp.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_AUDIT) += audit.o
compat-obj-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o
obj-$(CONFIG_COMPAT) += $(compat-obj-y)
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o
+obj-$(CONFIG_KPROBES) += kprobes_insn_page.o
+obj-$(CONFIG_KPROBES) += mcount.o
+obj-$(CONFIG_RETHOOK) += rethook.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o
+obj-$(CONFIG_FUNCTION_TRACER) += mcount.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o
obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o
+obj-$(CONFIG_CERT_STORE) += cert_store.o
+obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
-obj-$(CONFIG_IMA) += ima_arch.o
-
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o
+obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o
obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o
-obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o
+obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o
obj-$(CONFIG_TRACEPOINTS) += trace.o
+obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
# vdso
obj-y += vdso64/
+obj-$(CONFIG_COMPAT) += vdso32/
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
new file mode 100644
index 000000000000..f9efc54ec4b7
--- /dev/null
+++ b/arch/s390/kernel/abs_lowcore.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pgtable.h>
+#include <asm/abs_lowcore.h>
+
+unsigned long __bootdata_preserved(__abs_lowcore);
+
+int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ unsigned long phys = __pa(lc);
+ int rc, i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc);
+ if (rc) {
+ /*
+ * Do not unmap allocated page tables in case the
+ * allocation was not requested. In such a case the
+ * request is expected coming from an atomic context,
+ * while the unmap attempt might sleep.
+ */
+ if (alloc) {
+ for (--i; i >= 0; i--) {
+ addr -= PAGE_SIZE;
+ vmem_unmap_4k_page(addr);
+ }
+ }
+ return rc;
+ }
+ addr += PAGE_SIZE;
+ phys += PAGE_SIZE;
+ }
+ return 0;
+}
+
+void abs_lowcore_unmap(int cpu)
+{
+ unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+ int i;
+
+ for (i = 0; i < LC_PAGES; i++) {
+ vmem_unmap_4k_page(addr);
+ addr += PAGE_SIZE;
+ }
+}
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 8e1f2aee85ef..e7bca29f9c34 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/facility.h>
#include <asm/nospec-branch.h>
-#define MAX_PATCH_LEN (255 - 1)
-
static int __initdata_or_module alt_instr_disabled;
static int __init disable_alternative_instructions(char *str)
@@ -16,86 +17,30 @@ static int __init disable_alternative_instructions(char *str)
early_param("noaltinstr", disable_alternative_instructions);
-struct brcl_insn {
- u16 opc;
- s32 disp;
-} __packed;
-
-static u16 __initdata_or_module nop16 = 0x0700;
-static u32 __initdata_or_module nop32 = 0x47000000;
-static struct brcl_insn __initdata_or_module nop48 = {
- 0xc004, 0
-};
-
-static const void *nops[] __initdata_or_module = {
- &nop16,
- &nop32,
- &nop48
-};
-
-static void __init_or_module add_jump_padding(void *insns, unsigned int len)
-{
- struct brcl_insn brcl = {
- 0xc0f4,
- len / 2
- };
-
- memcpy(insns, &brcl, sizeof(brcl));
- insns += sizeof(brcl);
- len -= sizeof(brcl);
-
- while (len > 0) {
- memcpy(insns, &nop16, 2);
- insns += 2;
- len -= 2;
- }
-}
-
-static void __init_or_module add_padding(void *insns, unsigned int len)
-{
- if (len > 6)
- add_jump_padding(insns, len);
- else if (len >= 2)
- memcpy(insns, nops[len / 2 - 1], len);
-}
-
static void __init_or_module __apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
- u8 insnbuf[MAX_PATCH_LEN];
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
*/
for (a = start; a < end; a++) {
- int insnbuf_sz = 0;
-
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- if (!__test_facility(a->facility,
- S390_lowcore.alt_stfle_fac_list))
+ if (!__test_facility(a->facility, alt_stfle_fac_list))
continue;
- if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
+ if (unlikely(a->instrlen % 2)) {
WARN_ONCE(1, "cpu alternatives instructions length is "
"odd, skipping patching\n");
continue;
}
- memcpy(insnbuf, replacement, a->replacementlen);
- insnbuf_sz = a->replacementlen;
-
- if (a->instrlen > a->replacementlen) {
- add_padding(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
- insnbuf_sz += a->instrlen - a->replacementlen;
- }
-
- s390_kernel_write(instr, insnbuf, insnbuf_sz);
+ s390_kernel_write(instr, replacement, a->instrlen);
}
}
@@ -111,3 +56,20 @@ void __init apply_alternative_instructions(void)
{
apply_alternatives(__alt_instructions, __alt_instructions_end);
}
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+void text_poke_sync_lock(void)
+{
+ cpus_read_lock();
+ text_poke_sync();
+ cpus_read_unlock();
+}
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index ce33406cfe83..fa5f6885c74a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -11,11 +11,10 @@
#include <linux/kvm_host.h>
#include <linux/sched.h>
#include <linux/purgatory.h>
+#include <linux/pgtable.h>
+#include <linux/ftrace.h>
#include <asm/idle.h>
-#include <asm/vdso.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
-#include <asm/nmi.h>
#include <asm/stacktrace.h>
int main(void)
@@ -27,80 +26,55 @@ int main(void)
BLANK();
/* thread struct offsets */
OFFSET(__THREAD_ksp, thread_struct, ksp);
- OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table);
- OFFSET(__THREAD_last_break, thread_struct, last_break);
- OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc);
- OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs);
- OFFSET(__THREAD_per_cause, thread_struct, per_event.cause);
- OFFSET(__THREAD_per_address, thread_struct, per_event.address);
- OFFSET(__THREAD_per_paid, thread_struct, per_event.paid);
- OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb);
BLANK();
/* thread info offsets */
OFFSET(__TI_flags, task_struct, thread_info.flags);
BLANK();
/* pt_regs offsets */
- OFFSET(__PT_ARGS, pt_regs, args);
OFFSET(__PT_PSW, pt_regs, psw);
OFFSET(__PT_GPRS, pt_regs, gprs);
+ OFFSET(__PT_R0, pt_regs, gprs[0]);
+ OFFSET(__PT_R1, pt_regs, gprs[1]);
+ OFFSET(__PT_R2, pt_regs, gprs[2]);
+ OFFSET(__PT_R3, pt_regs, gprs[3]);
+ OFFSET(__PT_R4, pt_regs, gprs[4]);
+ OFFSET(__PT_R5, pt_regs, gprs[5]);
+ OFFSET(__PT_R6, pt_regs, gprs[6]);
+ OFFSET(__PT_R7, pt_regs, gprs[7]);
+ OFFSET(__PT_R8, pt_regs, gprs[8]);
+ OFFSET(__PT_R9, pt_regs, gprs[9]);
+ OFFSET(__PT_R10, pt_regs, gprs[10]);
+ OFFSET(__PT_R11, pt_regs, gprs[11]);
+ OFFSET(__PT_R12, pt_regs, gprs[12]);
+ OFFSET(__PT_R13, pt_regs, gprs[13]);
+ OFFSET(__PT_R14, pt_regs, gprs[14]);
+ OFFSET(__PT_R15, pt_regs, gprs[15]);
OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
- OFFSET(__PT_INT_CODE, pt_regs, int_code);
- OFFSET(__PT_INT_PARM, pt_regs, int_parm);
- OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long);
OFFSET(__PT_FLAGS, pt_regs, flags);
+ OFFSET(__PT_CR1, pt_regs, cr1);
+ OFFSET(__PT_LAST_BREAK, pt_regs, last_break);
DEFINE(__PT_SIZE, sizeof(struct pt_regs));
BLANK();
/* stack_frame offsets */
OFFSET(__SF_BACKCHAIN, stack_frame, back_chain);
OFFSET(__SF_GPRS, stack_frame, gprs);
- OFFSET(__SF_EMPTY, stack_frame, empty1);
- OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[0]);
- OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[1]);
- OFFSET(__SF_SIE_REASON, stack_frame, empty1[2]);
- OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[3]);
- BLANK();
- /* timeval/timezone offsets for use by vdso */
- OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count);
- OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp);
- OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec);
- OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec);
- OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec);
- OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec);
- OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec);
- OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec);
- OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec);
- OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec);
- OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest);
- OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available);
- OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult);
- OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
- OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir);
- OFFSET(__VDSO_TS_END, vdso_data, ts_end);
- OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
- OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
- OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val);
- BLANK();
- /* constants used by the vdso */
- DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME);
- DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC);
- DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
- DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
- DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID);
- DEFINE(__CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
- DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC);
+ OFFSET(__SF_EMPTY, stack_frame, empty[0]);
+ OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block);
+ OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea);
+ OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
+ OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
+ OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
+ DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
BLANK();
/* idle data offsets */
OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
- OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit);
OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter);
- OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit);
+ OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter);
BLANK();
/* hardware defined lowcore locations 0x000 - 0x1ff */
OFFSET(__LC_EXT_PARAMS, lowcore, ext_params);
OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr);
OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code);
- OFFSET(__LC_SVC_ILC, lowcore, svc_ilc);
- OFFSET(__LC_SVC_INT_CODE, lowcore, svc_code);
OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc);
OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code);
OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code);
@@ -118,12 +92,12 @@ int main(void)
OFFSET(__LC_SUBCHANNEL_NR, lowcore, subchannel_nr);
OFFSET(__LC_IO_INT_PARM, lowcore, io_int_parm);
OFFSET(__LC_IO_INT_WORD, lowcore, io_int_word);
- OFFSET(__LC_STFL_FAC_LIST, lowcore, stfl_fac_list);
- OFFSET(__LC_STFLE_FAC_LIST, lowcore, stfle_fac_list);
OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
- OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
+ OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break);
+ OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe);
+ OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe);
OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
OFFSET(__LC_EXT_OLD_PSW, lowcore, external_old_psw);
OFFSET(__LC_SVC_OLD_PSW, lowcore, svc_old_psw);
@@ -143,39 +117,33 @@ int main(void)
OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags);
OFFSET(__LC_RETURN_PSW, lowcore, return_psw);
OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw);
- OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer);
- OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer);
+ OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer);
OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer);
OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer);
- OFFSET(__LC_USER_TIMER, lowcore, user_timer);
- OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer);
- OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer);
OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer);
OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock);
OFFSET(__LC_INT_CLOCK, lowcore, int_clock);
- OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock);
- OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator);
OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock);
OFFSET(__LC_CURRENT, lowcore, current_task);
OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack);
OFFSET(__LC_ASYNC_STACK, lowcore, async_stack);
OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack);
OFFSET(__LC_RESTART_STACK, lowcore, restart_stack);
+ OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack);
OFFSET(__LC_RESTART_FN, lowcore, restart_fn);
OFFSET(__LC_RESTART_DATA, lowcore, restart_data);
OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source);
+ OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags);
+ OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce);
OFFSET(__LC_USER_ASCE, lowcore, user_asce);
- OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce);
OFFSET(__LC_LPP, lowcore, lpp);
OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
- OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset);
- OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data);
- OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags);
- OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
OFFSET(__LC_GMAP, lowcore, gmap);
- OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline);
+ OFFSET(__LC_LAST_BREAK, lowcore, last_break);
/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
+ OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info);
+ OFFSET(__LC_OS_INFO, lowcore, os_info);
/* hardware defined lowcore locations 0x1000 - 0x18ff */
OFFSET(__LC_MCESAD, lowcore, mcesad);
OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2);
@@ -187,13 +155,11 @@ int main(void)
OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area);
OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area);
OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area);
+ OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area);
OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area);
OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
BLANK();
- /* extended machine check save area */
- OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area);
- BLANK();
/* gmap/sie offsets */
OFFSET(__GMAP_ASCE, gmap, asce);
OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
@@ -202,5 +168,23 @@ int main(void)
OFFSET(__KEXEC_SHA_REGION_START, kexec_sha_region, start);
OFFSET(__KEXEC_SHA_REGION_LEN, kexec_sha_region, len);
DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region));
+ /* sizeof kernel parameter area */
+ DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea));
+ /* kernel parameter area offsets */
+ DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device));
+ DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start));
+ DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size));
+ DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base));
+ DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
+ DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
+ DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* function graph return value tracing */
+ OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2);
+ OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp);
+ DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs));
+#endif
+ OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
+ DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
return 0;
}
diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c
index d395c6c9944c..02051a596b87 100644
--- a/arch/s390/kernel/audit.c
+++ b/arch/s390/kernel/audit.c
@@ -47,15 +47,17 @@ int audit_classify_syscall(int abi, unsigned syscall)
#endif
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 0;
+ return AUDITSC_NATIVE;
}
}
diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S
deleted file mode 100644
index b79e0fd571f8..000000000000
--- a/arch/s390/kernel/base.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/s390/kernel/base.S
- *
- * Copyright IBM Corp. 2006, 2007
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
- * Michael Holzheu <holzheu@de.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/ptrace.h>
-#include <asm/sigp.h>
-
- GEN_BR_THUNK %r9
- GEN_BR_THUNK %r14
-
-ENTRY(s390_base_ext_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_ext_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
-1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC
- ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit
- lpswe __LC_EXT_OLD_PSW
-ENDPROC(s390_base_ext_handler)
-
- .section .bss
- .align 8
- .globl s390_base_ext_handler_fn
-s390_base_ext_handler_fn:
- .quad 0
- .previous
-
-ENTRY(s390_base_pgm_handler)
- stmg %r0,%r15,__LC_SAVE_AREA_SYNC
- basr %r13,0
-0: aghi %r15,-STACK_FRAME_OVERHEAD
- larl %r1,s390_base_pgm_handler_fn
- lg %r9,0(%r1)
- ltgr %r9,%r9
- jz 1f
- BASR_EX %r14,%r9
- lmg %r0,%r15,__LC_SAVE_AREA_SYNC
- lpswe __LC_PGM_OLD_PSW
-1: lpswe disabled_wait_psw-0b(%r13)
-ENDPROC(s390_base_pgm_handler)
-
- .align 8
-disabled_wait_psw:
- .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler
-
- .section .bss
- .align 8
- .globl s390_base_pgm_handler_fn
-s390_base_pgm_handler_fn:
- .quad 0
- .previous
diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c
index d66825e53fce..56254fa06f99 100644
--- a/arch/s390/kernel/cache.c
+++ b/arch/s390/kernel/cache.c
@@ -3,7 +3,6 @@
* Extract CPU cache information and expose them via sysfs.
*
* Copyright IBM Corp. 2012
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/seq_file.h>
@@ -47,7 +46,7 @@ struct cache_info {
#define CACHE_MAX_LEVEL 8
union cache_topology {
struct cache_info ci[CACHE_MAX_LEVEL];
- unsigned long long raw;
+ unsigned long raw;
};
static const char * const cache_type_string[] = {
@@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m)
struct cacheinfo *cache;
int idx;
- if (!test_facility(34))
- return;
this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask));
for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
cache = this_cpu_ci->info_list + idx;
@@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
if (!this_cpu_ci)
return -EINVAL;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
@@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu)
union cache_topology ct;
enum cache_type ctype;
- if (!test_facility(34))
- return -EOPNOTSUPP;
ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0);
for (idx = 0, level = 0; level < this_cpu_ci->num_levels &&
idx < this_cpu_ci->num_leaves; idx++, level++) {
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
new file mode 100644
index 000000000000..554447768bdd
--- /dev/null
+++ b/arch/s390/kernel/cert_store.c
@@ -0,0 +1,812 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DIAG 0x320 support and certificate store handling
+ *
+ * Copyright IBM Corp. 2023
+ * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key-type.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <crypto/sha2.h>
+#include <keys/user-type.h>
+#include <asm/debug.h>
+#include <asm/diag.h>
+#include <asm/ebcdic.h>
+#include <asm/sclp.h>
+
+#define DIAG_MAX_RETRIES 10
+
+#define VCE_FLAGS_VALID_MASK 0x80
+
+#define ISM_LEN_DWORDS 4
+#define VCSSB_LEN_BYTES 128
+#define VCSSB_LEN_NO_CERTS 4
+#define VCB_LEN_NO_CERTS 64
+#define VC_NAME_LEN_BYTES 64
+
+#define CERT_STORE_KEY_TYPE_NAME "cert_store_key"
+#define CERT_STORE_KEYRING_NAME "cert_store"
+
+static debug_info_t *cert_store_dbf;
+static debug_info_t *cert_store_hexdump;
+
+#define pr_dbf_msg(fmt, ...) \
+ debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__)
+
+enum diag320_subcode {
+ DIAG320_SUBCODES = 0,
+ DIAG320_STORAGE = 1,
+ DIAG320_CERT_BLOCK = 2,
+};
+
+enum diag320_rc {
+ DIAG320_RC_OK = 0x0001,
+ DIAG320_RC_CS_NOMATCH = 0x0306,
+};
+
+/* Verification Certificates Store Support Block (VCSSB). */
+struct vcssb {
+ u32 vcssb_length;
+ u8 pad_0x04[3];
+ u8 version;
+ u8 pad_0x08[8];
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u16 total_vc_index_count;
+ u16 max_vc_index_count;
+ u8 pad_0x24[28];
+ u32 max_vce_length;
+ u32 max_vcxe_length;
+ u8 pad_0x48[8];
+ u32 max_single_vcb_length;
+ u32 total_vcb_length;
+ u32 max_single_vcxb_length;
+ u32 total_vcxb_length;
+ u8 pad_0x60[32];
+} __packed __aligned(8);
+
+/* Verification Certificate Entry (VCE) Header. */
+struct vce_header {
+ u32 vce_length;
+ u8 flags;
+ u8 key_type;
+ u16 vc_index;
+ u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */
+ u8 vc_format;
+ u8 pad_0x49;
+ u16 key_id_length;
+ u8 pad_0x4c;
+ u8 vc_hash_type;
+ u16 vc_hash_length;
+ u8 pad_0x50[4];
+ u32 vc_length;
+ u8 pad_0x58[8];
+ u16 vc_hash_offset;
+ u16 vc_offset;
+ u8 pad_0x64[28];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB) Header. */
+struct vcb_header {
+ u32 vcb_input_length;
+ u8 pad_0x04[4];
+ u16 first_vc_index;
+ u16 last_vc_index;
+ u32 pad_0x0c;
+ u32 cs_token;
+ u8 pad_0x14[12];
+ u32 vcb_output_length;
+ u8 pad_0x24[3];
+ u8 version;
+ u16 stored_vc_count;
+ u16 remaining_vc_count;
+ u8 pad_0x2c[20];
+} __packed __aligned(4);
+
+/* Verification Certificate Block (VCB). */
+struct vcb {
+ struct vcb_header vcb_hdr;
+ u8 vcb_buf[];
+} __packed __aligned(4);
+
+/* Verification Certificate Entry (VCE). */
+struct vce {
+ struct vce_header vce_hdr;
+ u8 cert_data_buf[];
+} __packed __aligned(4);
+
+static void cert_store_key_describe(const struct key *key, struct seq_file *m)
+{
+ char ascii[VC_NAME_LEN_BYTES + 1];
+
+ /*
+ * First 64 bytes of the key description is key name in EBCDIC CP 500.
+ * Convert it to ASCII for displaying in /proc/keys.
+ */
+ strscpy(ascii, key->description, sizeof(ascii));
+ EBCASC_500(ascii, VC_NAME_LEN_BYTES);
+ seq_puts(m, ascii);
+
+ seq_puts(m, &key->description[VC_NAME_LEN_BYTES]);
+ if (key_is_positive(key))
+ seq_printf(m, ": %u", key->datalen);
+}
+
+/*
+ * Certificate store key type takes over properties of
+ * user key but cannot be updated.
+ */
+static struct key_type key_type_cert_store_key = {
+ .name = CERT_STORE_KEY_TYPE_NAME,
+ .preparse = user_preparse,
+ .free_preparse = user_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .revoke = user_revoke,
+ .destroy = user_destroy,
+ .describe = cert_store_key_describe,
+ .read = user_read,
+};
+
+/* Logging functions. */
+static void pr_dbf_vcb(const struct vcb *b)
+{
+ pr_dbf_msg("VCB Header:");
+ pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length);
+ pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index);
+ pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index);
+ pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token);
+ pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length);
+ pr_dbf_msg("version: %d", b->vcb_hdr.version);
+ pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count);
+ pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count);
+}
+
+static void pr_dbf_vce(const struct vce *e)
+{
+ unsigned char vc_name[VC_NAME_LEN_BYTES + 1];
+ char log_string[VC_NAME_LEN_BYTES + 40];
+
+ pr_dbf_msg("VCE Header:");
+ pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length);
+ pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags);
+ pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type);
+ pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index);
+ pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format);
+ pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length);
+ pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type);
+ pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length);
+ pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset);
+ pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length);
+ pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset);
+
+ /* Certificate name in ASCII. */
+ memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES);
+ EBCASC_500(vc_name, VC_NAME_LEN_BYTES);
+ vc_name[VC_NAME_LEN_BYTES] = '\0';
+
+ snprintf(log_string, sizeof(log_string),
+ "index: %d vce_hdr.vc_name (ASCII): %s",
+ e->vce_hdr.vc_index, vc_name);
+ debug_text_event(cert_store_hexdump, 3, log_string);
+
+ /* Certificate data. */
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start");
+ debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128);
+ debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end");
+ debug_event(cert_store_hexdump, 3,
+ (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128);
+}
+
+static void pr_dbf_vcssb(const struct vcssb *s)
+{
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1");
+ debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES);
+
+ pr_dbf_msg("VCSSB:");
+ pr_dbf_msg("vcssb_length: %u", s->vcssb_length);
+ pr_dbf_msg("version: %u", s->version);
+ pr_dbf_msg("cs_token: %u", s->cs_token);
+ pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count);
+ pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count);
+ pr_dbf_msg("max_vce_length: %u", s->max_vce_length);
+ pr_dbf_msg("max_vcxe_length: %u", s->max_vce_length);
+ pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length);
+ pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length);
+ pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length);
+ pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length);
+}
+
+static int __diag320(unsigned long subcode, void *addr)
+{
+ union register_pair rp = { .even = (unsigned long)addr, };
+
+ asm volatile(
+ " diag %[rp],%[subcode],0x320\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rp] "+d" (rp.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+
+ return rp.odd;
+}
+
+static int diag320(unsigned long subcode, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X320);
+
+ return __diag320(subcode, addr);
+}
+
+/*
+ * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in
+ * VCE. Return -EINVAL if hashes don't match.
+ */
+static int check_certificate_hash(const struct vce *vce)
+{
+ u8 hash[SHA256_DIGEST_SIZE];
+ u16 vc_hash_length;
+ u8 *vce_hash;
+
+ vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset;
+ vc_hash_length = vce->vce_hdr.vc_hash_length;
+ sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash);
+ if (memcmp(vce_hash, hash, vc_hash_length) == 0)
+ return 0;
+
+ pr_dbf_msg("SHA256 hash of received certificate does not match");
+ debug_text_event(cert_store_hexdump, 3, "VCE hash:");
+ debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE);
+ debug_text_event(cert_store_hexdump, 3, "Calculated hash:");
+ debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE);
+
+ return -EINVAL;
+}
+
+static int check_certificate_valid(const struct vce *vce)
+{
+ if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) {
+ pr_dbf_msg("Certificate entry is invalid");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_format != 1) {
+ pr_dbf_msg("Certificate format is not supported");
+ return -EINVAL;
+ }
+ if (vce->vce_hdr.vc_hash_type != 1) {
+ pr_dbf_msg("Hash type is not supported");
+ return -EINVAL;
+ }
+
+ return check_certificate_hash(vce);
+}
+
+static struct key *get_user_session_keyring(void)
+{
+ key_ref_t us_keyring_ref;
+
+ us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING,
+ KEY_LOOKUP_CREATE, KEY_NEED_LINK);
+ if (IS_ERR(us_keyring_ref)) {
+ pr_dbf_msg("Couldn't get user session keyring: %ld",
+ PTR_ERR(us_keyring_ref));
+ return ERR_PTR(-ENOKEY);
+ }
+ key_ref_put(us_keyring_ref);
+ return key_ref_to_ptr(us_keyring_ref);
+}
+
+/* Invalidate all keys from cert_store keyring. */
+static int invalidate_keyring_keys(struct key *keyring)
+{
+ unsigned long num_keys, key_index;
+ size_t keyring_payload_len;
+ key_serial_t *key_array;
+ struct key *current_key;
+ int rc;
+
+ keyring_payload_len = key_type_keyring.read(keyring, NULL, 0);
+ num_keys = keyring_payload_len / sizeof(key_serial_t);
+ key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL);
+ if (!key_array)
+ return -ENOMEM;
+
+ rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len);
+ if (rc != keyring_payload_len) {
+ pr_dbf_msg("Couldn't read keyring payload");
+ goto out;
+ }
+
+ for (key_index = 0; key_index < num_keys; key_index++) {
+ current_key = key_lookup(key_array[key_index]);
+ pr_dbf_msg("Invalidating key %08x", current_key->serial);
+
+ key_invalidate(current_key);
+ key_put(current_key);
+ rc = key_unlink(keyring, current_key);
+ if (rc) {
+ pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc);
+ break;
+ }
+ }
+out:
+ kfree(key_array);
+ return rc;
+}
+
+static struct key *find_cs_keyring(void)
+{
+ key_ref_t cs_keyring_ref;
+ struct key *cs_keyring;
+
+ cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true),
+ &key_type_keyring, CERT_STORE_KEYRING_NAME,
+ false);
+ if (!IS_ERR(cs_keyring_ref)) {
+ cs_keyring = key_ref_to_ptr(cs_keyring_ref);
+ key_ref_put(cs_keyring_ref);
+ goto found;
+ }
+ /* Search default locations: thread, process, session keyrings */
+ cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL);
+ if (IS_ERR(cs_keyring))
+ return NULL;
+ key_put(cs_keyring);
+found:
+ return cs_keyring;
+}
+
+static void cleanup_cs_keys(void)
+{
+ struct key *cs_keyring;
+
+ cs_keyring = find_cs_keyring();
+ if (!cs_keyring)
+ return;
+
+ pr_dbf_msg("Found cert_store keyring. Purging...");
+ /*
+ * Remove cert_store_key_type in case invalidation
+ * of old cert_store keys failed (= severe error).
+ */
+ if (invalidate_keyring_keys(cs_keyring))
+ unregister_key_type(&key_type_cert_store_key);
+
+ keyring_clear(cs_keyring);
+ key_invalidate(cs_keyring);
+ key_put(cs_keyring);
+ key_unlink(get_user_session_keyring(), cs_keyring);
+}
+
+static struct key *create_cs_keyring(void)
+{
+ static struct key *cs_keyring;
+
+ /* Cleanup previous cs_keyring and all associated keys if any. */
+ cleanup_cs_keys();
+ cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID,
+ GLOBAL_ROOT_GID, current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP,
+ NULL, get_user_session_keyring());
+ if (IS_ERR(cs_keyring)) {
+ pr_dbf_msg("Can't allocate cert_store keyring");
+ return NULL;
+ }
+
+ pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial);
+
+ /*
+ * In case a previous clean-up ran into an
+ * error and unregistered key type.
+ */
+ register_key_type(&key_type_cert_store_key);
+
+ return cs_keyring;
+}
+
+/*
+ * Allocate memory and create key description in format
+ * [key name in EBCDIC]:[VCE index]:[CS token].
+ * Return a pointer to key description or NULL if memory
+ * allocation failed. Memory should be freed by caller.
+ */
+static char *get_key_description(struct vcssb *vcssb, const struct vce *vce)
+{
+ size_t len, name_len;
+ u32 cs_token;
+ char *desc;
+
+ cs_token = vcssb->cs_token;
+ /* Description string contains "%64s:%05u:%010u\0". */
+ name_len = sizeof(vce->vce_hdr.vc_name);
+ len = name_len + 1 + 5 + 1 + 10 + 1;
+ desc = kmalloc(len, GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ memcpy(desc, vce->vce_hdr.vc_name, name_len);
+ snprintf(desc + name_len, len - name_len, ":%05u:%010u",
+ vce->vce_hdr.vc_index, cs_token);
+
+ return desc;
+}
+
+/*
+ * Create a key of type "cert_store_key" using the data from VCE for key
+ * payload and key description. Link the key to "cert_store" keyring.
+ */
+static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce,
+ struct key *keyring)
+{
+ key_ref_t newkey;
+ char *desc;
+ int rc;
+
+ desc = get_key_description(vcssb, vce);
+ if (!desc)
+ return -ENOMEM;
+
+ newkey = key_create_or_update(
+ make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME,
+ desc, (u8 *)vce + vce->vce_hdr.vc_offset,
+ vce->vce_hdr.vc_length,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA);
+
+ rc = PTR_ERR_OR_ZERO(newkey);
+ if (rc) {
+ pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc);
+ rc = -ENOKEY;
+ goto out;
+ }
+
+ key_ref_put(newkey);
+out:
+ kfree(desc);
+ return rc;
+}
+
+/* Get Verification Certificate Storage Size block with DIAG320 subcode2. */
+static int get_vcssb(struct vcssb *vcssb)
+{
+ int diag320_rc;
+
+ memset(vcssb, 0, sizeof(*vcssb));
+ vcssb->vcssb_length = VCSSB_LEN_BYTES;
+ diag320_rc = diag320(DIAG320_STORAGE, vcssb);
+ pr_dbf_vcssb(vcssb);
+
+ if (diag320_rc != DIAG320_RC_OK) {
+ pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc);
+ return -EIO;
+ }
+ if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificates available for current configuration");
+ return -ENOKEY;
+ }
+
+ return 0;
+}
+
+static u32 get_4k_mult_vcb_size(struct vcssb *vcssb)
+{
+ return round_up(vcssb->max_single_vcb_length, PAGE_SIZE);
+}
+
+/* Fill input fields of single-entry VCB that will be read by LPAR. */
+static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index)
+{
+ memset(vcb, 0, sizeof(*vcb));
+ vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb);
+ vcb->vcb_hdr.cs_token = vcssb->cs_token;
+
+ /* Request single entry. */
+ vcb->vcb_hdr.first_vc_index = index;
+ vcb->vcb_hdr.last_vc_index = index;
+}
+
+static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce)
+{
+ struct vce *extracted_vce;
+
+ extracted_vce = (struct vce *)vcb->vcb_buf;
+ memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length);
+ pr_dbf_vce(vce);
+}
+
+static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb)
+{
+ int rc, diag320_rc;
+
+ fill_vcb_input(vcssb, vcb, index);
+
+ diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb);
+ pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc);
+ pr_dbf_vcb(vcb);
+
+ switch (diag320_rc) {
+ case DIAG320_RC_OK:
+ rc = 0;
+ if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) {
+ pr_dbf_msg("No certificate entry for index %u", index);
+ rc = -ENOKEY;
+ } else if (vcb->vcb_hdr.remaining_vc_count != 0) {
+ /* Retry on insufficient space. */
+ pr_dbf_msg("Couldn't get all requested certificates");
+ rc = -EAGAIN;
+ }
+ break;
+ case DIAG320_RC_CS_NOMATCH:
+ pr_dbf_msg("Certificate Store token mismatch");
+ rc = -EAGAIN;
+ break;
+ default:
+ pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc);
+ rc = -EINVAL;
+ break;
+ }
+
+ return rc;
+}
+
+/*
+ * Allocate memory for single-entry VCB, get VCB via DIAG320 subcode 2 call,
+ * extract VCE and create a key from its' certificate.
+ */
+static int create_key_from_sevcb(struct vcssb *vcssb, u16 index,
+ struct key *keyring)
+{
+ struct vcb *vcb;
+ struct vce *vce;
+ int rc;
+
+ rc = -ENOMEM;
+ vcb = vmalloc(get_4k_mult_vcb_size(vcssb));
+ vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr));
+ if (!vcb || !vce)
+ goto out;
+
+ rc = get_sevcb(vcssb, index, vcb);
+ if (rc)
+ goto out;
+
+ extract_vce_from_sevcb(vcb, vce);
+ rc = check_certificate_valid(vce);
+ if (rc)
+ goto out;
+
+ rc = create_key_from_vce(vcssb, vce, keyring);
+ if (rc)
+ goto out;
+
+ pr_dbf_msg("Successfully created key from Certificate Entry %d", index);
+out:
+ vfree(vce);
+ vfree(vcb);
+ return rc;
+}
+
+/*
+ * Request a single-entry VCB for each VCE available for the partition.
+ * Create a key from it and link it to cert_store keyring. If no keys
+ * could be created (i.e. VCEs were invalid) return -ENOKEY.
+ */
+static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring)
+{
+ int rc, index, count, added;
+
+ count = 0;
+ added = 0;
+ /* Certificate Store entries indices start with 1 and have no gaps. */
+ for (index = 1; index < vcssb->total_vc_index_count + 1; index++) {
+ pr_dbf_msg("Creating key from VCE %u", index);
+ rc = create_key_from_sevcb(vcssb, index, keyring);
+ count++;
+
+ if (rc == -EAGAIN)
+ return rc;
+
+ if (rc)
+ pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc);
+ else
+ added++;
+ }
+
+ if (added == 0) {
+ pr_dbf_msg("Processed %d entries. No keys created", count);
+ return -ENOKEY;
+ }
+
+ pr_info("Added %d of %d keys to cert_store keyring", added, count);
+
+ /*
+ * Do not allow to link more keys to certificate store keyring after all
+ * the VCEs were processed.
+ */
+ rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL);
+ if (rc)
+ pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc);
+
+ return 0;
+}
+
+/*
+ * Check which DIAG320 subcodes are installed.
+ * Return -ENOENT if subcodes 1 or 2 are not available.
+ */
+static int query_diag320_subcodes(void)
+{
+ unsigned long ism[ISM_LEN_DWORDS];
+ int rc;
+
+ rc = diag320(0, ism);
+ if (rc != DIAG320_RC_OK) {
+ pr_dbf_msg("DIAG320 subcode query returned %04x", rc);
+ return -ENOENT;
+ }
+
+ debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0");
+ debug_event(cert_store_hexdump, 3, ism, sizeof(ism));
+
+ if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) {
+ pr_dbf_msg("Not all required DIAG320 subcodes are installed");
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if Certificate Store is supported by the firmware and DIAG320 subcodes
+ * 1 and 2 are installed. Create cert_store keyring and link all certificates
+ * available for the current partition to it as "cert_store_key" type
+ * keys. On refresh or error invalidate cert_store keyring and destroy
+ * all keys of "cert_store_key" type.
+ */
+static int fill_cs_keyring(void)
+{
+ struct key *cs_keyring;
+ struct vcssb *vcssb;
+ int rc;
+
+ rc = -ENOMEM;
+ vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL);
+ if (!vcssb)
+ goto cleanup_keys;
+
+ rc = -ENOENT;
+ if (!sclp.has_diag320) {
+ pr_dbf_msg("Certificate Store is not supported");
+ goto cleanup_keys;
+ }
+
+ rc = query_diag320_subcodes();
+ if (rc)
+ goto cleanup_keys;
+
+ rc = get_vcssb(vcssb);
+ if (rc)
+ goto cleanup_keys;
+
+ rc = -ENOMEM;
+ cs_keyring = create_cs_keyring();
+ if (!cs_keyring)
+ goto cleanup_keys;
+
+ rc = add_certificates_to_keyring(vcssb, cs_keyring);
+ if (rc)
+ goto cleanup_cs_keyring;
+
+ goto out;
+
+cleanup_cs_keyring:
+ key_put(cs_keyring);
+cleanup_keys:
+ cleanup_cs_keys();
+out:
+ kfree(vcssb);
+ return rc;
+}
+
+static DEFINE_MUTEX(cs_refresh_lock);
+static int cs_status_val = -1;
+
+static ssize_t cs_status_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (cs_status_val == -1)
+ return sysfs_emit(buf, "uninitialized\n");
+ else if (cs_status_val == 0)
+ return sysfs_emit(buf, "ok\n");
+
+ return sysfs_emit(buf, "failed (%d)\n", cs_status_val);
+}
+
+static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status);
+
+static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc, retries;
+
+ pr_dbf_msg("Refresh certificate store information requested");
+ rc = mutex_lock_interruptible(&cs_refresh_lock);
+ if (rc)
+ return rc;
+
+ for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) {
+ /* Request certificates from certificate store. */
+ rc = fill_cs_keyring();
+ if (rc)
+ pr_dbf_msg("Failed to refresh certificate store information (%d)", rc);
+ if (rc != -EAGAIN)
+ break;
+ }
+ cs_status_val = rc;
+ mutex_unlock(&cs_refresh_lock);
+
+ return rc ?: count;
+}
+
+static struct kobj_attribute refresh_attr = __ATTR_WO(refresh);
+
+static const struct attribute *cert_store_attrs[] __initconst = {
+ &cs_status_attr.attr,
+ &refresh_attr.attr,
+ NULL,
+};
+
+static struct kobject *cert_store_kobj;
+
+static int __init cert_store_init(void)
+{
+ int rc = -ENOMEM;
+
+ cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64);
+ if (!cert_store_dbf)
+ goto cleanup_dbf;
+
+ cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128);
+ if (!cert_store_hexdump)
+ goto cleanup_dbf;
+
+ debug_register_view(cert_store_hexdump, &debug_hex_ascii_view);
+ debug_register_view(cert_store_dbf, &debug_sprintf_view);
+
+ /* Create directory /sys/firmware/cert_store. */
+ cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj);
+ if (!cert_store_kobj)
+ goto cleanup_dbf;
+
+ rc = sysfs_create_files(cert_store_kobj, cert_store_attrs);
+ if (rc)
+ goto cleanup_kobj;
+
+ register_key_type(&key_type_cert_store_key);
+
+ return rc;
+
+cleanup_kobj:
+ kobject_put(cert_store_kobj);
+cleanup_dbf:
+ debug_unregister(cert_store_dbf);
+ debug_unregister(cert_store_hexdump);
+
+ return rc;
+}
+device_initcall(cert_store_init);
diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c
index 444fb1f66944..a7c46e8310f0 100644
--- a/arch/s390/kernel/compat_audit.c
+++ b/arch/s390/kernel/compat_audit.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#undef __s390x__
+#include <linux/audit_arch.h>
#include <asm/unistd.h>
#include "audit.h"
@@ -32,14 +33,16 @@ int s390_classify_syscall(unsigned syscall)
{
switch(syscall) {
case __NR_open:
- return 2;
+ return AUDITSC_OPEN;
case __NR_openat:
- return 3;
+ return AUDITSC_OPENAT;
case __NR_socketcall:
- return 4;
+ return AUDITSC_SOCKETCALL;
case __NR_execve:
- return 5;
+ return AUDITSC_EXECVE;
+ case __NR_openat2:
+ return AUDITSC_OPENAT2;
default:
- return 1;
+ return AUDITSC_COMPAT;
}
}
diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h
index 64509e7dbd3b..ef23739b277c 100644
--- a/arch/s390/kernel/compat_linux.h
+++ b/arch/s390/kernel/compat_linux.h
@@ -5,69 +5,59 @@
#include <linux/compat.h>
#include <linux/socket.h>
#include <linux/syscalls.h>
+#include <asm/ptrace.h>
-/* Macro that masks the high order bit of an 32 bit pointer and converts it*/
-/* to a 64 bit pointer */
-#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
-#define AA(__x) \
- ((unsigned long)(__x))
+/*
+ * Macro that masks the high order bit of a 32 bit pointer and
+ * converts it to a 64 bit pointer.
+ */
+#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL))
+#define AA(__x) ((unsigned long)(__x))
/* Now 32bit compatibility types */
struct ipc_kludge_32 {
- __u32 msgp; /* pointer */
- __s32 msgtyp;
+ __u32 msgp; /* pointer */
+ __s32 msgtyp;
};
/* asm/sigcontext.h */
-typedef union
-{
- __u64 d;
- __u32 f;
+typedef union {
+ __u64 d;
+ __u32 f;
} freg_t32;
-typedef struct
-{
+typedef struct {
unsigned int fpc;
unsigned int pad;
- freg_t32 fprs[__NUM_FPRS];
+ freg_t32 fprs[__NUM_FPRS];
} _s390_fp_regs32;
-typedef struct
-{
- __u32 mask;
- __u32 addr;
-} _psw_t32 __attribute__ ((aligned(8)));
-
-typedef struct
-{
- _psw_t32 psw;
+typedef struct {
+ psw_t32 psw;
__u32 gprs[__NUM_GPRS];
__u32 acrs[__NUM_ACRS];
} _s390_regs_common32;
-typedef struct
-{
+typedef struct {
_s390_regs_common32 regs;
- _s390_fp_regs32 fpregs;
+ _s390_fp_regs32 fpregs;
} _sigregs32;
-typedef struct
-{
- __u32 gprs_high[__NUM_GPRS];
- __u64 vxrs_low[__NUM_VXRS_LOW];
- __vector128 vxrs_high[__NUM_VXRS_HIGH];
- __u8 __reserved[128];
+typedef struct {
+ __u32 gprs_high[__NUM_GPRS];
+ __u64 vxrs_low[__NUM_VXRS_LOW];
+ __vector128 vxrs_high[__NUM_VXRS_HIGH];
+ __u8 __reserved[128];
} _sigregs_ext32;
#define _SIGCONTEXT_NSIG32 64
#define _SIGCONTEXT_NSIG_BPW32 32
#define __SIGNAL_FRAMESIZE32 96
-#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2)
+#define _SIGMASK_COPY_SIZE32 (sizeof(u32) * 2)
-struct sigcontext32
-{
+struct sigcontext32 {
__u32 oldmask[_COMPAT_NSIG_WORDS];
- __u32 sregs; /* pointer */
+ __u32 sregs; /* pointer */
};
/* asm/signal.h */
@@ -75,11 +65,11 @@ struct sigcontext32
/* asm/ucontext.h */
struct ucontext32 {
__u32 uc_flags;
- __u32 uc_link; /* pointer */
+ __u32 uc_link; /* pointer */
compat_stack_t uc_stack;
_sigregs32 uc_mcontext;
compat_sigset_t uc_sigmask;
- /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
+ /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */
unsigned char __unused[128 - sizeof(compat_sigset_t)];
_sigregs_ext32 uc_mcontext_ext;
};
@@ -88,25 +78,6 @@ struct stat64_emu31;
struct mmap_arg_struct_emu31;
struct fadvise64_64_args;
-long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group);
-long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group);
-long compat_sys_s390_setregid16(u16 rgid, u16 egid);
-long compat_sys_s390_setgid16(u16 gid);
-long compat_sys_s390_setreuid16(u16 ruid, u16 euid);
-long compat_sys_s390_setuid16(u16 uid);
-long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid);
-long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid);
-long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid);
-long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid);
-long compat_sys_s390_setfsuid16(u16 uid);
-long compat_sys_s390_setfsgid16(u16 gid);
-long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist);
-long compat_sys_s390_getuid16(void);
-long compat_sys_s390_geteuid16(void);
-long compat_sys_s390_getgid16(void);
-long compat_sys_s390_getegid16(void);
long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low);
long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low);
long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low);
@@ -118,8 +89,8 @@ long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbu
long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag);
long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg);
long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg);
-long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count);
-long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count);
+long compat_sys_s390_read(unsigned int fd, char __user *buf, compat_size_t count);
+long compat_sys_s390_write(unsigned int fd, const char __user *buf, compat_size_t count);
long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise);
long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args);
long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags);
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 38d4bdbc34b9..f8fc6c25d051 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -28,6 +28,8 @@
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
+#include <asm/fpu/api.h>
#include "compat_linux.h"
#include "compat_ptrace.h"
#include "entry.h"
@@ -88,7 +90,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
_sigregs32 user_sregs;
int i;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs)))
@@ -97,10 +99,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI))
return -EINVAL;
- /* Test the floating-point-control word. */
- if (test_fp_ctl(user_sregs.fpregs.fpc))
- return -EINVAL;
-
/* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */
regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) |
(__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 |
@@ -136,9 +134,9 @@ static int save_sigregs_ext32(struct pt_regs *regs,
return -EFAULT;
/* Save vector registers to signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = current->thread.fpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
@@ -164,7 +162,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
*(__u32 *)&regs->gprs[i] = gprs_high[i];
/* Restore vector registers from signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
__copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -172,7 +170,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ current->thread.fpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -264,7 +262,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
* the machine supports it
*/
frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved);
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
frame_size -= sizeof(frame->sregs_ext.vxrs_low) +
sizeof(frame->sregs_ext.vxrs_high);
frame = get_sigframe(&ksig->ka, regs, frame_size);
@@ -303,11 +301,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- /* Signal frames without vectors registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, sigreturn);
}
/* Set up registers for signal handler */
@@ -351,11 +345,12 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
* the machine supports it
*/
uc_flags = UC_GPRS_HIGH;
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
uc_flags |= UC_VXRS;
- } else
+ } else {
frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) +
sizeof(frame->uc.uc_mcontext_ext.vxrs_high);
+ }
frame = get_sigframe(&ksig->ka, regs, frame_size);
if (frame == (void __user *) -1UL)
return -EFAULT;
@@ -370,10 +365,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
restorer = (unsigned long __force)
ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE;
} else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE;
+ restorer = VDSO32_SYMBOL(current, rt_sigreturn);
}
/* Create siginfo on the signal stack */
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index af013b4244d3..b210a29d3ee9 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -16,41 +16,45 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/io.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
-#include <asm/io.h>
static DEFINE_SPINLOCK(cpcmd_lock);
static char cpcmd_buf[241];
static int diag8_noresponse(int cmdlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = cmdlen;
-
asm volatile(
- " diag %1,%0,0x8\n"
- : "+d" (reg3) : "d" (reg2) : "cc");
- return reg3;
+ " diag %[rx],%[ry],0x8\n"
+ : [ry] "+&d" (cmdlen)
+ : [rx] "d" (__pa(cpcmd_buf))
+ : "cc");
+ return cmdlen;
}
static int diag8_response(int cmdlen, char *response, int *rlen)
{
- register unsigned long reg2 asm ("2") = (addr_t) cpcmd_buf;
- register unsigned long reg3 asm ("3") = (addr_t) response;
- register unsigned long reg4 asm ("4") = cmdlen | 0x40000000L;
- register unsigned long reg5 asm ("5") = *rlen;
+ union register_pair rx, ry;
+ int cc;
+ rx.even = __pa(cpcmd_buf);
+ rx.odd = __pa(response);
+ ry.even = cmdlen | 0x40000000L;
+ ry.odd = *rlen;
asm volatile(
- " diag %2,%0,0x8\n"
- " brc 8,1f\n"
- " agr %1,%4\n"
- "1:\n"
- : "+d" (reg4), "+d" (reg5)
- : "d" (reg2), "d" (reg3), "d" (*rlen) : "cc");
- *rlen = reg5;
- return reg4;
+ " diag %[rx],%[ry],0x8\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), [ry] "+&d" (ry.pair)
+ : [rx] "d" (rx.pair)
+ : "cc");
+ if (cc)
+ *rlen += ry.odd;
+ else
+ *rlen = ry.odd;
+ return ry.even;
}
/*
diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c
new file mode 100644
index 000000000000..1b2ae42a0c15
--- /dev/null
+++ b/arch/s390/kernel/cpufeature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2022
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/bug.h>
+#include <asm/elf.h>
+
+enum {
+ TYPE_HWCAP,
+ TYPE_FACILITY,
+};
+
+struct s390_cpu_feature {
+ unsigned int type : 4;
+ unsigned int num : 28;
+};
+
+static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = {
+ [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA},
+ [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS},
+ [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158},
+};
+
+/*
+ * cpu_have_feature - Test CPU features on module initialization
+ */
+int cpu_have_feature(unsigned int num)
+{
+ struct s390_cpu_feature *feature;
+
+ if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES))
+ return 0;
+ feature = &s390_cpu_features[num];
+ switch (feature->type) {
+ case TYPE_HWCAP:
+ return !!(elf_hwcap & BIT(feature->num));
+ case TYPE_FACILITY:
+ return test_facility(feature->num);
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+EXPORT_SYMBOL(cpu_have_feature);
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index f96a5857bbfd..5c46c2659305 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -15,11 +15,14 @@
#include <linux/slab.h>
#include <linux/memblock.h>
#include <linux/elf.h>
+#include <linux/uio.h>
#include <asm/asm-offsets.h>
#include <asm/os_info.h>
#include <asm/elf.h>
#include <asm/ipl.h>
#include <asm/sclp.h>
+#include <asm/maccess.h>
+#include <asm/fpu/api.h>
#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y)))
#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y)))
@@ -44,7 +47,7 @@ struct save_area {
u64 fprs[16];
u32 fpc;
u32 prefix;
- u64 todpreg;
+ u32 todpreg;
u64 timer;
u64 todcmp;
u64 vxrs_low[16];
@@ -60,9 +63,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu)
{
struct save_area *sa;
- sa = (void *) memblock_phys_alloc(sizeof(*sa), 8);
+ sa = memblock_alloc(sizeof(*sa), 8);
if (!sa)
- panic("Failed to allocate save area\n");
+ return NULL;
if (is_boot_cpu)
list_add(&sa->list, &dump_save_areas);
@@ -108,126 +111,65 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs)
/* Copy lower halves of vector registers 0-15 */
for (i = 0; i < 16; i++)
- memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8);
+ sa->vxrs_low[i] = vxrs[i].low;
/* Copy vector registers 16-31 */
memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128));
}
-/*
- * Return physical address for virtual address
- */
-static inline void *load_real_addr(void *addr)
-{
- unsigned long real_addr;
-
- asm volatile(
- " lra %0,0(%1)\n"
- " jz 0f\n"
- " la %0,0\n"
- "0:"
- : "=a" (real_addr) : "a" (addr) : "cc");
- return (void *)real_addr;
-}
-
-/*
- * Copy memory of the old, dumped system to a kernel space virtual address
- */
-int copy_oldmem_kernel(void *dst, void *src, size_t count)
+static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long from, len;
- void *ra;
- int rc;
+ size_t len, copied, res = 0;
while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_kernel(dst, from, len);
- if (rc)
- return rc;
+ if (!oldmem_data.start && src < sclp.hsa_size) {
+ /* Copy from zfcp/nvme dump HSA area */
+ len = min(count, sclp.hsa_size - src);
+ copied = memcpy_hsa_iter(iter, src, len);
} else {
/* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
+ if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) {
+ src -= oldmem_data.start;
+ len = min(count, oldmem_data.size - src);
+ } else if (oldmem_data.start && src < oldmem_data.size) {
+ len = min(count, oldmem_data.size - src);
+ src += oldmem_data.start;
} else {
len = count;
}
- if (is_vmalloc_or_module_addr(dst)) {
- ra = load_real_addr(dst);
- len = min(PAGE_SIZE - offset_in_page(ra), len);
- } else {
- ra = dst;
- }
- if (memcpy_real(ra, (void *) from, len))
- return -EFAULT;
+ copied = memcpy_real_iter(iter, src, len);
}
- dst += len;
- src += len;
- count -= len;
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- return 0;
+ return res;
}
-/*
- * Copy memory of the old, dumped system to a user space virtual address
- */
-static int copy_oldmem_user(void __user *dst, void *src, size_t count)
+int copy_oldmem_kernel(void *dst, unsigned long src, size_t count)
{
- unsigned long from, len;
- int rc;
+ struct iov_iter iter;
+ struct kvec kvec;
- while (count) {
- from = __pa(src);
- if (!OLDMEM_BASE && from < sclp.hsa_size) {
- /* Copy from zfcpdump HSA area */
- len = min(count, sclp.hsa_size - from);
- rc = memcpy_hsa_user(dst, from, len);
- if (rc)
- return rc;
- } else {
- /* Check for swapped kdump oldmem areas */
- if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) {
- from -= OLDMEM_BASE;
- len = min(count, OLDMEM_SIZE - from);
- } else if (OLDMEM_BASE && from < OLDMEM_SIZE) {
- len = min(count, OLDMEM_SIZE - from);
- from += OLDMEM_BASE;
- } else {
- len = count;
- }
- rc = copy_to_user_real(dst, (void *) from, count);
- if (rc)
- return rc;
- }
- dst += len;
- src += len;
- count -= len;
- }
+ kvec.iov_base = dst;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
+ if (copy_oldmem_iter(&iter, src, count) < count)
+ return -EFAULT;
return 0;
}
/*
* Copy one page from "oldmem"
*/
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
- unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize,
+ unsigned long offset)
{
- void *src;
- int rc;
+ unsigned long src;
- if (!csize)
- return 0;
- src = (void *) (pfn << PAGE_SHIFT) + offset;
- if (userbuf)
- rc = copy_oldmem_user((void __force __user *) buf, src, csize);
- else
- rc = copy_oldmem_kernel((void *) buf, src, csize);
- return rc;
+ src = pfn_to_phys(pfn) + offset;
+ return copy_oldmem_iter(iter, src, csize);
}
/*
@@ -243,10 +185,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
unsigned long size_old;
int rc;
- if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) {
- size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT));
+ if (pfn < oldmem_data.size >> PAGE_SHIFT) {
+ size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT));
rc = remap_pfn_range(vma, from,
- pfn + (OLDMEM_BASE >> PAGE_SHIFT),
+ pfn + (oldmem_data.start >> PAGE_SHIFT),
size_old, prot);
if (rc || size == size_old)
return rc;
@@ -258,7 +200,7 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for zfcpdump
+ * Remap "oldmem" for zfcp/nvme dump
*
* We only map available memory above HSA size. Memory below HSA size
* is read on demand using the copy_oldmem_page() function.
@@ -283,12 +225,12 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma,
}
/*
- * Remap "oldmem" for kdump or zfcpdump
+ * Remap "oldmem" for kdump or zfcp/nvme dump
*/
int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot);
else
return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size,
@@ -365,7 +307,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs));
memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw));
memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs));
- nt_prstatus.pr_pid = cpu;
+ nt_prstatus.common.pr_pid = cpu;
/* Prepare fpregset (floating point) note */
memset(&nt_fpregset, 0, sizeof(nt_fpregset));
memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
@@ -378,7 +320,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa)
ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg));
ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs));
ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix));
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
ptr = nt_init(ptr, NT_S390_VXRS_HIGH,
&sa->vxrs_high, sizeof(sa->vxrs_high));
ptr = nt_init(ptr, NT_S390_VXRS_LOW,
@@ -402,7 +344,7 @@ static size_t get_cpu_elf_notes_size(void)
size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg));
size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs));
size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix));
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high));
size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low));
}
@@ -429,10 +371,10 @@ static void *nt_prpsinfo(void *ptr)
static void *get_vmcoreinfo_old(unsigned long *size)
{
char nt_name[11], *vmcoreinfo;
+ unsigned long addr;
Elf64_Nhdr note;
- void *addr;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr)))
return NULL;
memset(nt_name, 0, sizeof(nt_name));
if (copy_oldmem_kernel(&note, addr, sizeof(note)))
@@ -549,8 +491,7 @@ static int get_mem_chunk_cnt(void)
int cnt = 0;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, NULL, NULL, NULL)
+ for_each_physmem_range(idx, &oldmem_type, NULL, NULL)
cnt++;
return cnt;
}
@@ -558,17 +499,16 @@ static int get_mem_chunk_cnt(void)
/*
* Initialize ELF loads (new kernel)
*/
-static void loads_init(Elf64_Phdr *phdr, u64 loads_offset)
+static void loads_init(Elf64_Phdr *phdr)
{
phys_addr_t start, end;
u64 idx;
- for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
- MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_physmem_range(idx, &oldmem_type, &start, &end) {
phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
phdr->p_offset = start;
- phdr->p_vaddr = start;
+ phdr->p_vaddr = (unsigned long)__va(start);
phdr->p_paddr = start;
phdr->p_memsz = end - start;
phdr->p_flags = PF_R | PF_W | PF_X;
@@ -629,23 +569,23 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
Elf64_Phdr *phdr_notes, *phdr_loads;
+ size_t alloc_size;
int mem_chunk_cnt;
void *ptr, *hdr;
- u32 alloc_size;
u64 hdr_off;
- /* If we are not in kdump or zfcpdump mode return */
- if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP)
+ /* If we are not in kdump or zfcp/nvme dump mode return */
+ if (!oldmem_data.start && !is_ipl_type_dump())
return 0;
- /* If we cannot get HSA size for zfcpdump return error */
- if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size)
+ /* If we cannot get HSA size for zfcp/nvme dump return error */
+ if (is_ipl_type_dump() && !sclp.hsa_size)
return -ENODEV;
/* For kdump, exclude previous crashkernel memory */
- if (OLDMEM_BASE) {
- oldmem_region.base = OLDMEM_BASE;
- oldmem_region.size = OLDMEM_SIZE;
- oldmem_type.total_size = OLDMEM_SIZE;
+ if (oldmem_data.start) {
+ oldmem_region.base = oldmem_data.start;
+ oldmem_region.size = oldmem_data.size;
+ oldmem_type.total_size = oldmem_data.size;
}
mem_chunk_cnt = get_mem_chunk_cnt();
@@ -673,7 +613,7 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
/* Init loads */
hdr_off = PTR_DIFF(ptr, hdr);
- loads_init(phdr_loads, hdr_off);
+ loads_init(phdr_loads);
*addr = (unsigned long long) hdr;
*size = (unsigned long long) hdr_off;
BUG_ON(elfcorehdr_size > alloc_size);
diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c
new file mode 100644
index 000000000000..8cc26cf2c64a
--- /dev/null
+++ b/arch/s390/kernel/ctlreg.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+
+#include <linux/irqflags.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/cache.h>
+#include <asm/abs_lowcore.h>
+#include <asm/ctlreg.h>
+
+/*
+ * ctl_lock guards access to global control register contents which
+ * are kept in the control register save area within absolute lowcore
+ * at physical address zero.
+ */
+static DEFINE_SPINLOCK(system_ctl_lock);
+
+void system_ctlreg_lock(void)
+ __acquires(&system_ctl_lock)
+{
+ spin_lock(&system_ctl_lock);
+}
+
+void system_ctlreg_unlock(void)
+ __releases(&system_ctl_lock)
+{
+ spin_unlock(&system_ctl_lock);
+}
+
+static bool system_ctlreg_area_init __ro_after_init;
+
+void __init system_ctlreg_init_save_area(struct lowcore *lc)
+{
+ struct lowcore *abs_lc;
+
+ abs_lc = get_abs_lowcore();
+ __local_ctl_store(0, 15, lc->cregs_save_area);
+ __local_ctl_store(0, 15, abs_lc->cregs_save_area);
+ put_abs_lowcore(abs_lc);
+ system_ctlreg_area_init = true;
+}
+
+struct ctlreg_parms {
+ unsigned long andval;
+ unsigned long orval;
+ unsigned long val;
+ int request;
+ int cr;
+};
+
+static void ctlreg_callback(void *info)
+{
+ struct ctlreg_parms *pp = info;
+ struct ctlreg regs[16];
+
+ __local_ctl_store(0, 15, regs);
+ if (pp->request == CTLREG_LOAD) {
+ regs[pp->cr].val = pp->val;
+ } else {
+ regs[pp->cr].val &= pp->andval;
+ regs[pp->cr].val |= pp->orval;
+ }
+ __local_ctl_load(0, 15, regs);
+}
+
+static void system_ctlreg_update(void *info)
+{
+ unsigned long flags;
+
+ if (system_state == SYSTEM_BOOTING) {
+ /*
+ * For very early calls do not call on_each_cpu()
+ * since not everything might be setup.
+ */
+ local_irq_save(flags);
+ ctlreg_callback(info);
+ local_irq_restore(flags);
+ } else {
+ on_each_cpu(ctlreg_callback, info, 1);
+ }
+}
+
+void system_ctlreg_modify(unsigned int cr, unsigned long data, int request)
+{
+ struct ctlreg_parms pp = { .cr = cr, .request = request, };
+ struct lowcore *abs_lc;
+
+ switch (request) {
+ case CTLREG_SET_BIT:
+ pp.orval = 1UL << data;
+ pp.andval = -1UL;
+ break;
+ case CTLREG_CLEAR_BIT:
+ pp.orval = 0;
+ pp.andval = ~(1UL << data);
+ break;
+ case CTLREG_LOAD:
+ pp.val = data;
+ break;
+ }
+ if (system_ctlreg_area_init) {
+ system_ctlreg_lock();
+ abs_lc = get_abs_lowcore();
+ if (request == CTLREG_LOAD) {
+ abs_lc->cregs_save_area[cr].val = pp.val;
+ } else {
+ abs_lc->cregs_save_area[cr].val &= pp.andval;
+ abs_lc->cregs_save_area[cr].val |= pp.orval;
+ }
+ put_abs_lowcore(abs_lc);
+ system_ctlreg_update(&pp);
+ system_ctlreg_unlock();
+ } else {
+ system_ctlreg_update(&pp);
+ }
+}
+EXPORT_SYMBOL(system_ctlreg_modify);
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 6d321f5f101d..85328a0ef3b6 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -2,7 +2,7 @@
/*
* S/390 debug facility
*
- * Copyright IBM Corp. 1999, 2012
+ * Copyright IBM Corp. 1999, 2020
*
* Author(s): Michael Holzheu (holzheu@de.ibm.com),
* Holger Smolinski (Holger.Smolinski@de.ibm.com)
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/minmax.h>
#include <linux/debugfs.h>
#include <asm/debug.h>
@@ -59,7 +60,7 @@ typedef struct {
* except of floats, and long long (32 bit)
*
*/
- long args[0];
+ long args[];
} debug_sprintf_entry_t;
/* internal function prototyes */
@@ -90,27 +91,13 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
size_t user_buf_size, loff_t *offset);
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, const char *in_buf);
-static int debug_raw_format_fn(debug_info_t *id,
- struct debug_view *view, char *out_buf,
- const char *in_buf);
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf);
-
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, debug_sprintf_entry_t *curr_event);
+ char *out_buf, const char *inbuf);
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b);
+static void debug_events_append(debug_info_t *dest, debug_info_t *src);
/* globals */
-struct debug_view debug_raw_view = {
- "raw",
- NULL,
- &debug_raw_header_fn,
- &debug_raw_format_fn,
- NULL,
- NULL
-};
-EXPORT_SYMBOL(debug_raw_view);
-
struct debug_view debug_hex_ascii_view = {
"hex_ascii",
NULL,
@@ -152,7 +139,7 @@ struct debug_view debug_sprintf_view = {
"sprintf",
NULL,
&debug_dflt_header_fn,
- (debug_format_proc_t *)&debug_sprintf_format_fn,
+ &debug_sprintf_format_fn,
NULL,
NULL
};
@@ -198,9 +185,10 @@ static debug_entry_t ***debug_areas_alloc(int pages_per_area, int nr_areas)
if (!areas)
goto fail_malloc_areas;
for (i = 0; i < nr_areas; i++) {
+ /* GFP_NOWARN to avoid user triggerable WARN, we handle fails */
areas[i] = kmalloc_array(pages_per_area,
sizeof(debug_entry_t *),
- GFP_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN);
if (!areas[i])
goto fail_malloc_areas2;
for (j = 0; j < pages_per_area; j++) {
@@ -262,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area,
rc->level = level;
rc->buf_size = buf_size;
rc->entry_size = sizeof(debug_entry_t) + buf_size;
- strlcpy(rc->name, name, sizeof(rc->name));
+ strscpy(rc->name, name, sizeof(rc->name));
memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *));
refcount_set(&(rc->ref_count), 0);
@@ -326,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area,
goto out;
rc->mode = mode & ~S_IFMT;
-
- /* create root directory */
- rc->debugfs_root_entry = debugfs_create_dir(rc->name,
- debug_debugfs_root_entry);
-
- /* append new element to linked list */
- if (!debug_area_first) {
- /* first element in list */
- debug_area_first = rc;
- rc->prev = NULL;
- } else {
- /* append element to end of list */
- debug_area_last->next = rc;
- rc->prev = debug_area_last;
- }
- debug_area_last = rc;
- rc->next = NULL;
-
refcount_set(&rc->ref_count, 1);
out:
return rc;
@@ -403,27 +373,10 @@ static void debug_info_get(debug_info_t *db_info)
*/
static void debug_info_put(debug_info_t *db_info)
{
- int i;
-
if (!db_info)
return;
- if (refcount_dec_and_test(&db_info->ref_count)) {
- for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
- if (!db_info->views[i])
- continue;
- debugfs_remove(db_info->debugfs_entries[i]);
- }
- debugfs_remove(db_info->debugfs_root_entry);
- if (db_info == debug_area_first)
- debug_area_first = db_info->next;
- if (db_info == debug_area_last)
- debug_area_last = db_info->prev;
- if (db_info->prev)
- db_info->prev->next = db_info->next;
- if (db_info->next)
- db_info->next->prev = db_info->prev;
+ if (refcount_dec_and_test(&db_info->ref_count))
debug_info_free(db_info);
- }
}
/*
@@ -448,7 +401,7 @@ static int debug_format_entry(file_private_info_t *p_info)
act_entry = (debug_entry_t *) ((char *)id_snap->areas[p_info->act_area]
[p_info->act_page] + p_info->act_entry);
- if (act_entry->id.stck == 0LL)
+ if (act_entry->clock == 0LL)
goto out; /* empty entry */
if (view->header_proc)
len += view->header_proc(id_snap, view, p_info->act_area,
@@ -647,6 +600,31 @@ static int debug_close(struct inode *inode, struct file *file)
return 0; /* success */
}
+/* Create debugfs entries and add to internal list. */
+static void _debug_register(debug_info_t *id)
+{
+ /* create root directory */
+ id->debugfs_root_entry = debugfs_create_dir(id->name,
+ debug_debugfs_root_entry);
+
+ /* append new element to linked list */
+ if (!debug_area_first) {
+ /* first element in list */
+ debug_area_first = id;
+ id->prev = NULL;
+ } else {
+ /* append element to end of list */
+ debug_area_last->next = id;
+ id->prev = debug_area_last;
+ }
+ debug_area_last = id;
+ id->next = NULL;
+
+ debug_register_view(id, &debug_level_view);
+ debug_register_view(id, &debug_flush_view);
+ debug_register_view(id, &debug_pages_view);
+}
+
/**
* debug_register_mode() - creates and initializes debug area.
*
@@ -676,19 +654,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area,
if ((uid != 0) || (gid != 0))
pr_warn("Root becomes the owner of all s390dbf files in sysfs\n");
BUG_ON(!initialized);
- mutex_lock(&debug_mutex);
/* create new debug_info */
rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode);
- if (!rc)
- goto out;
- debug_register_view(rc, &debug_level_view);
- debug_register_view(rc, &debug_flush_view);
- debug_register_view(rc, &debug_pages_view);
-out:
- if (!rc)
+ if (rc) {
+ mutex_lock(&debug_mutex);
+ _debug_register(rc);
+ mutex_unlock(&debug_mutex);
+ } else {
pr_err("Registering debug feature %s failed\n", name);
- mutex_unlock(&debug_mutex);
+ }
return rc;
}
EXPORT_SYMBOL(debug_register_mode);
@@ -718,6 +693,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area,
EXPORT_SYMBOL(debug_register);
/**
+ * debug_register_static() - registers a static debug area
+ *
+ * @id: Handle for static debug area
+ * @pages_per_area: Number of pages per area
+ * @nr_areas: Number of debug areas
+ *
+ * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO.
+ *
+ * Note: This function is called automatically via an initcall generated by
+ * DEFINE_STATIC_DEBUG_INFO.
+ */
+void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas)
+{
+ unsigned long flags;
+ debug_info_t *copy;
+
+ if (!initialized) {
+ pr_err("Tried to register debug feature %s too early\n",
+ id->name);
+ return;
+ }
+
+ copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS);
+ if (!copy) {
+ pr_err("Registering debug feature %s failed\n", id->name);
+
+ /* Clear pointers to prevent tracing into released initdata. */
+ spin_lock_irqsave(&id->lock, flags);
+ id->areas = NULL;
+ id->active_pages = NULL;
+ id->active_entries = NULL;
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ return;
+ }
+
+ /* Replace static trace area with dynamic copy. */
+ spin_lock_irqsave(&id->lock, flags);
+ debug_events_append(copy, id);
+ debug_areas_swap(id, copy);
+ spin_unlock_irqrestore(&id->lock, flags);
+
+ /* Clear pointers to initdata and discard copy. */
+ copy->areas = NULL;
+ copy->active_pages = NULL;
+ copy->active_entries = NULL;
+ debug_info_free(copy);
+
+ mutex_lock(&debug_mutex);
+ _debug_register(id);
+ mutex_unlock(&debug_mutex);
+}
+
+/* Remove debugfs entries and remove from internal list. */
+static void _debug_unregister(debug_info_t *id)
+{
+ int i;
+
+ for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
+ if (!id->views[i])
+ continue;
+ debugfs_remove(id->debugfs_entries[i]);
+ }
+ debugfs_remove(id->debugfs_root_entry);
+ if (id == debug_area_first)
+ debug_area_first = id->next;
+ if (id == debug_area_last)
+ debug_area_last = id->prev;
+ if (id->prev)
+ id->prev->next = id->next;
+ if (id->next)
+ id->next->prev = id->prev;
+}
+
+/**
* debug_unregister() - give back debug area.
*
* @id: handle for debug log
@@ -730,8 +781,10 @@ void debug_unregister(debug_info_t *id)
if (!id)
return;
mutex_lock(&debug_mutex);
- debug_info_put(id);
+ _debug_unregister(id);
mutex_unlock(&debug_mutex);
+
+ debug_info_put(id);
}
EXPORT_SYMBOL(debug_unregister);
@@ -741,35 +794,28 @@ EXPORT_SYMBOL(debug_unregister);
*/
static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area)
{
- debug_entry_t ***new_areas;
+ debug_info_t *new_id;
unsigned long flags;
- int rc = 0;
if (!id || (nr_areas <= 0) || (pages_per_area < 0))
return -EINVAL;
- if (pages_per_area > 0) {
- new_areas = debug_areas_alloc(pages_per_area, nr_areas);
- if (!new_areas) {
- pr_info("Allocating memory for %i pages failed\n",
- pages_per_area);
- rc = -ENOMEM;
- goto out;
- }
- } else {
- new_areas = NULL;
+
+ new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size,
+ id->level, ALL_AREAS);
+ if (!new_id) {
+ pr_info("Allocating memory for %i pages failed\n",
+ pages_per_area);
+ return -ENOMEM;
}
+
spin_lock_irqsave(&id->lock, flags);
- debug_areas_free(id);
- id->areas = new_areas;
- id->nr_areas = nr_areas;
- id->pages_per_area = pages_per_area;
- id->active_area = 0;
- memset(id->active_entries, 0, sizeof(int)*id->nr_areas);
- memset(id->active_pages, 0, sizeof(int)*id->nr_areas);
+ debug_events_append(new_id, id);
+ debug_areas_swap(new_id, id);
+ debug_info_free(new_id);
spin_unlock_irqrestore(&id->lock, flags);
pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area);
-out:
- return rc;
+
+ return 0;
}
/**
@@ -787,16 +833,17 @@ void debug_set_level(debug_info_t *id, int new_level)
if (!id)
return;
- spin_lock_irqsave(&id->lock, flags);
+
if (new_level == DEBUG_OFF_LEVEL) {
- id->level = DEBUG_OFF_LEVEL;
pr_info("%s: switched off\n", id->name);
} else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) {
pr_info("%s: level %i is out of range (%i - %i)\n",
id->name, new_level, 0, DEBUG_MAX_LEVEL);
- } else {
- id->level = new_level;
+ return;
}
+
+ spin_lock_irqsave(&id->lock, flags);
+ id->level = new_level;
spin_unlock_irqrestore(&id->lock, flags);
}
EXPORT_SYMBOL(debug_set_level);
@@ -836,6 +883,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
id->active_entries[id->active_area]);
}
+/* Swap debug areas of a and b. */
+static void debug_areas_swap(debug_info_t *a, debug_info_t *b)
+{
+ swap(a->nr_areas, b->nr_areas);
+ swap(a->pages_per_area, b->pages_per_area);
+ swap(a->areas, b->areas);
+ swap(a->active_area, b->active_area);
+ swap(a->active_pages, b->active_pages);
+ swap(a->active_entries, b->active_entries);
+}
+
+/* Append all debug events in active area from source to destination log. */
+static void debug_events_append(debug_info_t *dest, debug_info_t *src)
+{
+ debug_entry_t *from, *to, *last;
+
+ if (!src->areas || !dest->areas)
+ return;
+
+ /* Loop over all entries in src, starting with oldest. */
+ from = get_active_entry(src);
+ last = from;
+ do {
+ if (from->clock != 0LL) {
+ to = get_active_entry(dest);
+ memset(to, 0, dest->entry_size);
+ memcpy(to, from, min(src->entry_size,
+ dest->entry_size));
+ proceed_active_entry(dest);
+ }
+
+ proceed_active_entry(src);
+ from = get_active_entry(src);
+ } while (from != last);
+}
+
/*
* debug_finish_entry:
* - set timestamp, caller address, cpu number etc.
@@ -844,12 +927,17 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active,
int level, int exception)
{
- active->id.stck = get_tod_clock_fast() -
- *(unsigned long long *) &tod_clock_base[1];
- active->id.fields.cpuid = smp_processor_id();
+ unsigned long timestamp;
+ union tod_clock clk;
+
+ store_tod_clock_ext(&clk);
+ timestamp = clk.us;
+ timestamp -= TOD_UNIX_EPOCH >> 12;
+ active->clock = timestamp;
+ active->cpu = smp_processor_id();
active->caller = __builtin_return_address(0);
- active->id.fields.exception = exception;
- active->id.fields.level = level;
+ active->exception = exception;
+ active->level = level;
proceed_active_entry(id);
if (exception)
proceed_active_area(id);
@@ -867,7 +955,7 @@ static int debug_active = 1;
* if debug_active is already off
*/
static int s390dbf_procactive(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!write || debug_stoppable || !debug_active)
return proc_dointvec(table, write, buffer, lenp, ppos);
@@ -890,17 +978,6 @@ static struct ctl_table s390dbf_table[] = {
.mode = S_IRUGO | S_IWUSR,
.proc_handler = s390dbf_procactive,
},
- { }
-};
-
-static struct ctl_table s390dbf_dir_table[] = {
- {
- .procname = "s390dbf",
- .maxlen = 0,
- .mode = S_IRUGO | S_IXUGO,
- .child = s390dbf_table,
- },
- { }
};
static struct ctl_table_header *s390dbf_sysctl_header;
@@ -1121,16 +1198,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view)
break;
}
if (i == DEBUG_MAX_VIEWS) {
- pr_err("Registering view %s/%s would exceed the maximum "
- "number of views %i\n", id->name, view->name, i);
rc = -1;
} else {
id->views[i] = view;
id->debugfs_entries[i] = pde;
}
spin_unlock_irqrestore(&id->lock, flags);
- if (rc)
+ if (rc) {
+ pr_err("Registering view %s/%s would exceed the maximum "
+ "number of views %i\n", id->name, view->name, i);
debugfs_remove(pde);
+ }
out:
return rc;
}
@@ -1385,32 +1463,6 @@ out:
}
/*
- * prints debug header in raw format
- */
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
- int area, debug_entry_t *entry, char *out_buf)
-{
- int rc;
-
- rc = sizeof(debug_entry_t);
- memcpy(out_buf, entry, sizeof(debug_entry_t));
- return rc;
-}
-
-/*
- * prints debug data in raw format
- */
-static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, const char *in_buf)
-{
- int rc;
-
- rc = id->buf_size;
- memcpy(out_buf, in_buf, id->buf_size);
- return rc;
-}
-
-/*
* prints debug data in hex/ascii format
*/
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
@@ -1439,25 +1491,24 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
int area, debug_entry_t *entry, char *out_buf)
{
- unsigned long base, sec, usec;
+ unsigned long sec, usec;
unsigned long caller;
unsigned int level;
char *except_str;
int rc = 0;
- level = entry->id.fields.level;
- base = (*(unsigned long *) &tod_clock_base[0]) >> 4;
- sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12);
+ level = entry->level;
+ sec = entry->clock;
usec = do_div(sec, USEC_PER_SEC);
- if (entry->id.fields.exception)
+ if (entry->exception)
except_str = "*";
else
except_str = "-";
caller = (unsigned long) entry->caller;
- rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %02i %pK ",
+ rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ",
area, sec, usec, level, except_str,
- entry->id.fields.cpuid, (void *)caller);
+ entry->cpu, (void *)caller);
return rc;
}
EXPORT_SYMBOL(debug_dflt_header_fn);
@@ -1470,8 +1521,9 @@ EXPORT_SYMBOL(debug_dflt_header_fn);
#define DEBUG_SPRINTF_MAX_ARGS 10
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
- char *out_buf, debug_sprintf_entry_t *curr_event)
+ char *out_buf, const char *inbuf)
{
+ debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf;
int num_longs, num_used_args = 0, i, rc = 0;
int index[DEBUG_SPRINTF_MAX_ARGS];
@@ -1511,7 +1563,7 @@ out:
*/
static int __init debug_init(void)
{
- s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table);
+ s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table);
mutex_lock(&debug_mutex);
debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL);
initialized = 1;
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index e9dac9a24d3f..92fdc35f028c 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -11,9 +11,12 @@
#include <linux/cpu.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/trace/diag.h>
#include <asm/sections.h>
+#include "entry.h"
struct diag_stat {
unsigned int counter[NR_DIAG_STAT];
@@ -33,6 +36,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" },
[DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" },
[DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" },
+ [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" },
[DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" },
[DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" },
[DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" },
@@ -47,11 +51,24 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = {
[DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" },
[DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" },
[DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" },
+ [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" },
[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
};
-struct diag_ops __bootdata_preserved(diag_dma_ops);
-struct diag210 *__bootdata_preserved(__diag210_tmp_dma);
+struct diag_ops __amode31_ref diag_amode31_ops = {
+ .diag210 = _diag210_amode31,
+ .diag26c = _diag26c_amode31,
+ .diag14 = _diag14_amode31,
+ .diag0c = _diag0c_amode31,
+ .diag8c = _diag8c_amode31,
+ .diag308_reset = _diag308_reset_amode31
+};
+
+static struct diag210 _diag210_tmp_amode31 __section(".amode31.data");
+struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31;
+
+static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data");
+static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31;
static int show_diag_stat(struct seq_file *m, void *v)
{
@@ -59,7 +76,7 @@ static int show_diag_stat(struct seq_file *m, void *v)
unsigned long n = (unsigned long) v - 1;
int cpu, prec, tmp;
- get_online_cpus();
+ cpus_read_lock();
if (n == 0) {
seq_puts(m, " ");
@@ -78,13 +95,13 @@ static int show_diag_stat(struct seq_file *m, void *v)
}
seq_printf(m, " %s\n", diag_map[n-1].name);
}
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
static void *show_diag_stat_start(struct seq_file *m, loff_t *pos)
{
- return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL;
+ return *pos <= NR_DIAG_STAT ? (void *)((unsigned long) *pos + 1) : NULL;
}
static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos)
@@ -104,18 +121,7 @@ static const struct seq_operations show_diag_stat_sops = {
.show = show_diag_stat,
};
-static int show_diag_stat_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &show_diag_stat_sops);
-}
-
-static const struct file_operations show_diag_stat_fops = {
- .open = show_diag_stat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
+DEFINE_SEQ_ATTRIBUTE(show_diag_stat);
static int __init show_diag_stat_init(void)
{
@@ -133,7 +139,7 @@ void diag_stat_inc(enum diag_stat_enum nr)
}
EXPORT_SYMBOL(diag_stat_inc);
-void diag_stat_inc_norecursion(enum diag_stat_enum nr)
+void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr)
{
this_cpu_inc(diag_stat.counter[nr]);
trace_s390_diagnose_norecursion(diag_map[nr].code);
@@ -146,26 +152,46 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion);
int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
{
diag_stat_inc(DIAG_STAT_X014);
- return diag_dma_ops.diag14(rx, ry1, subcode);
+ return diag_amode31_ops.diag14(rx, ry1, subcode);
}
EXPORT_SYMBOL(diag14);
static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
{
- register unsigned long _subcode asm("0") = *subcode;
- register unsigned long _size asm("1") = size;
+ union register_pair rp = { .even = *subcode, .odd = size };
asm volatile(
- " diag %2,%0,0x204\n"
+ " diag %[addr],%[rp],0x204\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- *subcode = _subcode;
- return _size;
+ : [rp] "+&d" (rp.pair) : [addr] "d" (addr) : "memory");
+ *subcode = rp.even;
+ return rp.odd;
}
+/**
+ * diag204() - Issue diagnose 204 call.
+ * @subcode: Subcode of diagnose 204 to be executed.
+ * @size: Size of area in pages which @area points to, if given.
+ * @addr: Vmalloc'ed memory area where the result is written to.
+ *
+ * Execute diagnose 204 with the given subcode and write the result to the
+ * memory area specified with @addr. For subcodes which do not write a
+ * result to memory both @size and @addr must be zero. If @addr is
+ * specified it must be page aligned and must have been allocated with
+ * vmalloc(). Conversion to real / physical addresses will be handled by
+ * this function if required.
+ */
int diag204(unsigned long subcode, unsigned long size, void *addr)
{
+ if (addr) {
+ if (WARN_ON_ONCE(!is_vmalloc_addr(addr)))
+ return -1;
+ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE)))
+ return -1;
+ }
+ if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4)
+ addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr));
diag_stat_inc(DIAG_STAT_X204);
size = __diag204(&subcode, size, addr);
if (subcode)
@@ -184,20 +210,42 @@ int diag210(struct diag210 *addr)
int ccode;
spin_lock_irqsave(&diag210_lock, flags);
- *__diag210_tmp_dma = *addr;
+ *__diag210_tmp_amode31 = *addr;
diag_stat_inc(DIAG_STAT_X210);
- ccode = diag_dma_ops.diag210(__diag210_tmp_dma);
+ ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31);
- *addr = *__diag210_tmp_dma;
+ *addr = *__diag210_tmp_amode31;
spin_unlock_irqrestore(&diag210_lock, flags);
return ccode;
}
EXPORT_SYMBOL(diag210);
+/*
+ * Diagnose 8C: Access 3270 Display Device Information
+ */
+int diag8c(struct diag8c *addr, struct ccw_dev_id *devno)
+{
+ static DEFINE_SPINLOCK(diag8c_lock);
+ unsigned long flags;
+ int ccode;
+
+ spin_lock_irqsave(&diag8c_lock, flags);
+
+ diag_stat_inc(DIAG_STAT_X08C);
+ ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr));
+
+ *addr = *__diag8c_tmp_amode31;
+ spin_unlock_irqrestore(&diag8c_lock, flags);
+
+ return ccode;
+}
+EXPORT_SYMBOL(diag8c);
+
int diag224(void *ptr)
{
+ unsigned long addr = __pa(ptr);
int rc = -EOPNOTSUPP;
diag_stat_inc(DIAG_STAT_X224);
@@ -206,7 +254,7 @@ int diag224(void *ptr)
"0: lhi %0,0x0\n"
"1:\n"
EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+ : "+d" (rc) :"d" (0), "d" (addr) : "memory");
return rc;
}
EXPORT_SYMBOL(diag224);
@@ -217,6 +265,6 @@ EXPORT_SYMBOL(diag224);
int diag26c(void *req, void *resp, enum diag26c_sc subcode)
{
diag_stat_inc(DIAG_STAT_X26C);
- return diag_dma_ops.diag26c(req, resp, subcode);
+ return diag_amode31_ops.diag26c(req, resp, subcode);
}
EXPORT_SYMBOL(diag26c);
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index f304802ecf7b..89dc826a8d2e 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -24,8 +24,8 @@
#include <linux/kdebug.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
+#include <linux/io.h>
#include <asm/dis.h>
-#include <asm/io.h>
#include <asm/cpcmd.h>
#include <asm/lowcore.h>
#include <asm/debug.h>
@@ -278,6 +278,7 @@ static const unsigned char formats[][6] = {
[INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 },
[INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 },
[INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 },
+ [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 },
[INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 },
[INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 },
[INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 },
@@ -312,10 +313,12 @@ static const unsigned char formats[][6] = {
[INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 },
[INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 },
[INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 },
+ [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 },
[INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 },
[INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 },
[INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 },
[INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 },
+ [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 },
[INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 },
[INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 },
[INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 },
@@ -482,32 +485,38 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
return (int) (ptr - buffer);
}
+static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len)
+{
+ if (user_mode(regs)) {
+ if (copy_from_user(dst, (char __user *)src, len))
+ return -EFAULT;
+ } else {
+ if (copy_from_kernel_nofault(dst, src, len))
+ return -EFAULT;
+ }
+ return 0;
+}
+
void show_code(struct pt_regs *regs)
{
char *mode = user_mode(regs) ? "User" : "Krnl";
unsigned char code[64];
char buffer[128], *ptr;
- mm_segment_t old_fs;
unsigned long addr;
int start, end, opsize, hops, i;
/* Get a snapshot of the 64 bytes surrounding the fault address. */
- old_fs = get_fs();
- set_fs(user_mode(regs) ? USER_DS : KERNEL_DS);
for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) {
addr = regs->psw.addr - 34 + start;
- if (__copy_from_user(code + start - 2,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + start - 2, (void *)addr, 2))
break;
}
for (end = 32; end < 64; end += 2) {
addr = regs->psw.addr + end - 32;
- if (__copy_from_user(code + end,
- (char __user *) addr, 2))
+ if (copy_from_regs(regs, code + end, (void *)addr, 2))
break;
}
- set_fs(old_fs);
- /* Code snapshot useable ? */
+ /* Code snapshot usable ? */
if ((regs->psw.addr & 1) || start >= end) {
printk("%s Code: Bad PSW.\n", mode);
return;
@@ -557,7 +566,7 @@ void show_code(struct pt_regs *regs)
void print_fn_code(unsigned char *code, unsigned long len)
{
- char buffer[64], *ptr;
+ char buffer[128], *ptr;
int opsize, i;
while (len) {
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index d306fe04489a..d2012635b093 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -41,51 +41,50 @@ const char *stack_type_name(enum stack_type type)
EXPORT_SYMBOL_GPL(stack_type_name);
static inline bool in_stack(unsigned long sp, struct stack_info *info,
- enum stack_type type, unsigned long low,
- unsigned long high)
+ enum stack_type type, unsigned long stack)
{
- if (sp < low || sp >= high)
+ if (sp < stack || sp >= stack + THREAD_SIZE)
return false;
info->type = type;
- info->begin = low;
- info->end = high;
+ info->begin = stack;
+ info->end = stack + THREAD_SIZE;
return true;
}
static bool in_task_stack(unsigned long sp, struct task_struct *task,
struct stack_info *info)
{
- unsigned long stack;
+ unsigned long stack = (unsigned long)task_stack_page(task);
- stack = (unsigned long) task_stack_page(task);
- return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE);
+ return in_stack(sp, info, STACK_TYPE_TASK, stack);
}
static bool in_irq_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.async_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_IRQ, stack);
}
static bool in_nodat_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.nodat_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_NODAT, stack);
+}
+
+static bool in_mcck_stack(unsigned long sp, struct stack_info *info)
+{
+ unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET;
+
+ return in_stack(sp, info, STACK_TYPE_MCCK, stack);
}
static bool in_restart_stack(unsigned long sp, struct stack_info *info)
{
- unsigned long frame_size, top;
+ unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET;
- frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs);
- top = S390_lowcore.restart_stack + frame_size;
- return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top);
+ return in_stack(sp, info, STACK_TYPE_RESTART, stack);
}
int get_stack_info(unsigned long sp, struct task_struct *task,
@@ -108,7 +107,8 @@ int get_stack_info(unsigned long sp, struct task_struct *task,
/* Check per-cpu stacks */
if (!in_irq_stack(sp, info) &&
!in_nodat_stack(sp, info) &&
- !in_restart_stack(sp, info))
+ !in_restart_stack(sp, info) &&
+ !in_mcck_stack(sp, info))
goto unknown;
recursion_check:
@@ -126,22 +126,29 @@ unknown:
return -EINVAL;
}
-void show_stack(struct task_struct *task, unsigned long *stack)
+void show_stack(struct task_struct *task, unsigned long *stack,
+ const char *loglvl)
{
struct unwind_state state;
- printk("Call Trace:\n");
+ printk("%sCall Trace:\n", loglvl);
unwind_for_each_frame(&state, task, NULL, (unsigned long) stack)
- printk(state.reliable ? " [<%016lx>] %pSR \n" :
- "([<%016lx>] %pSR)\n",
- state.ip, (void *) state.ip);
+ printk(state.reliable ? "%s [<%016lx>] %pSR \n" :
+ "%s([<%016lx>] %pSR)\n",
+ loglvl, state.ip, (void *) state.ip);
debug_show_held_locks(task ? : current);
}
static void show_last_breaking_event(struct pt_regs *regs)
{
printk("Last Breaking-Event-Address:\n");
- printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]);
+ printk(" [<%016lx>] ", regs->last_break);
+ if (user_mode(regs)) {
+ print_vma_addr(KERN_CONT, regs->last_break);
+ pr_cont("\n");
+ } else {
+ pr_cont("%pSR\n", (void *)regs->last_break);
+ }
}
void show_registers(struct pt_regs *regs)
@@ -175,13 +182,13 @@ void show_regs(struct pt_regs *regs)
show_registers(regs);
/* Show stack backtrace if pt_regs is from kernel mode */
if (!user_mode(regs))
- show_stack(NULL, (unsigned long *) regs->gprs[15]);
+ show_stack(NULL, (unsigned long *) regs->gprs[15], KERN_DEFAULT);
show_last_breaking_event(regs);
}
static DEFINE_SPINLOCK(die_lock);
-void die(struct pt_regs *regs, const char *str)
+void __noreturn die(struct pt_regs *regs, const char *str)
{
static int die_counter;
@@ -195,6 +202,8 @@ void die(struct pt_regs *regs, const char *str)
regs->int_code >> 17, ++die_counter);
#ifdef CONFIG_PREEMPT
pr_cont("PREEMPT ");
+#elif defined(CONFIG_PREEMPT_RT)
+ pr_cont("PREEMPT_RT ");
#endif
pr_cont("SMP ");
if (debug_pagealloc_enabled())
@@ -211,5 +220,5 @@ void die(struct pt_regs *regs, const char *str)
if (panic_on_oops)
panic("Fatal exception: panic_on_oops");
oops_exit();
- do_exit(SIGSEGV);
+ make_task_dead(SIGSEGV);
}
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index db32a55daaec..2345ea332b97 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -2,7 +2,6 @@
/*
* Copyright IBM Corp. 2007, 2009
* Author(s): Hongjie Yang <hongjie@us.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "setup"
@@ -18,6 +17,8 @@
#include <linux/pfn.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
+#include <asm/asm-extable.h>
+#include <linux/memblock.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
@@ -33,18 +34,43 @@
#include <asm/switch_to.h>
#include "entry.h"
+#define decompressor_handled_param(param) \
+static int __init ignore_decompressor_param_##param(char *s) \
+{ \
+ return 0; \
+} \
+early_param(#param, ignore_decompressor_param_##param)
+
+decompressor_handled_param(mem);
+decompressor_handled_param(vmalloc);
+decompressor_handled_param(dfltcc);
+decompressor_handled_param(facilities);
+decompressor_handled_param(nokaslr);
+decompressor_handled_param(cmma);
+#if IS_ENABLED(CONFIG_KVM)
+decompressor_handled_param(prot_virt);
+#endif
+
+static void __init kasan_early_init(void)
+{
+#ifdef CONFIG_KASAN
+ init_task.kasan_depth = 0;
+ sclp_early_printk("KernelAddressSanitizer initialized\n");
+#endif
+}
+
static void __init reset_tod_clock(void)
{
- u64 time;
+ union tod_clock clk;
- if (store_tod_clock(&time) == 0)
+ if (store_tod_clock_ext_cc(&clk) == 0)
return;
/* TOD clock not running. Set the clock to Unix Epoch. */
- if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0)
+ if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk))
disabled_wait();
- memset(tod_clock_base, 0, 16);
- *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH;
+ memset(&tod_clock_base, 0, sizeof(tod_clock_base));
+ tod_clock_base.tod = TOD_UNIX_EPOCH;
S390_lowcore.last_update_clock = TOD_UNIX_EPOCH;
}
@@ -147,44 +173,27 @@ static __init void setup_topology(void)
topology_max_mnest = max_mnest;
}
-static void early_pgm_check_handler(void)
+void __do_early_pgm_check(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
- unsigned long cr0, cr0_new;
- unsigned long addr;
-
- addr = S390_lowcore.program_old_psw.addr;
- fixup = s390_search_extables(addr);
- if (!fixup)
+ if (!fixup_exception(regs))
disabled_wait();
- /* Disable low address protection before storing into lowcore. */
- __ctl_store(cr0, 0, 0);
- cr0_new = cr0 & ~(1UL << 28);
- __ctl_load(cr0_new, 0, 0);
- S390_lowcore.program_old_psw.addr = extable_fixup(fixup);
- __ctl_load(cr0, 0, 0);
}
static noinline __init void setup_lowcore_early(void)
{
psw_t psw;
- psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA;
- psw.addr = (unsigned long) s390_base_ext_handler;
- S390_lowcore.external_new_psw = psw;
- psw.addr = (unsigned long) s390_base_pgm_handler;
+ psw.addr = (unsigned long)early_pgm_check_handler;
+ psw.mask = PSW_KERNEL_BITS;
S390_lowcore.program_new_psw = psw;
- s390_base_pgm_handler_fn = early_pgm_check_handler;
S390_lowcore.preempt_count = INIT_PREEMPT_COUNT;
}
static noinline __init void setup_facility_list(void)
{
- memcpy(S390_lowcore.alt_stfle_fac_list,
- S390_lowcore.stfle_fac_list,
- sizeof(S390_lowcore.alt_stfle_fac_list));
+ memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list));
if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
static __init void detect_diag9c(void)
@@ -204,26 +213,11 @@ static __init void detect_diag9c(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C;
}
-static __init void detect_diag44(void)
-{
- int rc;
-
- diag_stat_inc(DIAG_STAT_X044);
- asm volatile(
- " diag 0,0,0x44\n"
- "0: la %0,0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc");
- if (!rc)
- S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44;
-}
-
static __init void detect_machine_facilities(void)
{
if (test_facility(8)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1;
- __ctl_set_bit(0, 23);
+ system_ctl_set_bit(0, CR0_EDAT_BIT);
}
if (test_facility(78))
S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2;
@@ -231,26 +225,28 @@ static __init void detect_machine_facilities(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE;
if (test_facility(50) && test_facility(73)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
- __ctl_set_bit(0, 55);
+ system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT);
}
if (test_facility(51))
S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
- if (test_facility(129)) {
- S390_lowcore.machine_flags |= MACHINE_FLAG_VX;
- __ctl_set_bit(0, 17);
- }
- if (test_facility(130) && !noexec_disabled) {
+ if (test_facility(129))
+ system_ctl_set_bit(0, CR0_VECTOR_BIT);
+ if (test_facility(130))
S390_lowcore.machine_flags |= MACHINE_FLAG_NX;
- __ctl_set_bit(0, 20);
- }
if (test_facility(133))
S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
- if (test_facility(139) && (tod_clock_base[1] & 0x80)) {
+ if (test_facility(139) && (tod_clock_base.tod >> 63)) {
/* Enabled signed clock comparator comparisons */
S390_lowcore.machine_flags |= MACHINE_FLAG_SCC;
clock_comparator_max = -1ULL >> 1;
- __ctl_set_bit(0, 53);
+ system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT);
}
+ if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) {
+ S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO;
+ /* the control bit is set during PCI initialization */
+ }
+ if (test_facility(194))
+ S390_lowcore.machine_flags |= MACHINE_FLAG_RDP;
}
static inline void save_vector_registers(void)
@@ -261,15 +257,9 @@ static inline void save_vector_registers(void)
#endif
}
-static inline void setup_control_registers(void)
+static inline void setup_low_address_protection(void)
{
- unsigned long reg;
-
- __ctl_store(reg, 0, 0);
- reg |= CR0_LOW_ADDRESS_PROTECTION;
- reg |= CR0_EMERGENCY_SIGNAL_SUBMASK;
- reg |= CR0_EXTERNAL_CALL_SUBMASK;
- __ctl_load(reg, 0, 0);
+ system_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
}
static inline void setup_access_registers(void)
@@ -279,64 +269,37 @@ static inline void setup_access_registers(void)
restore_access_regs(acrs);
}
-static int __init disable_vector_extension(char *str)
-{
- S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
- __ctl_clear_bit(0, 17);
- return 0;
-}
-early_param("novx", disable_vector_extension);
-
-static int __init cad_setup(char *str)
-{
- bool enabled;
- int rc;
-
- rc = kstrtobool(str, &enabled);
- if (!rc && enabled && test_facility(128))
- /* Enable problem state CAD. */
- __ctl_set_bit(2, 3);
- return rc;
-}
-early_param("cad", cad_setup);
-
char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
static void __init setup_boot_command_line(void)
{
/* copy arch command line */
- strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE);
}
-static void __init check_image_bootable(void)
+static void __init sort_amode31_extable(void)
{
- if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING)))
- return;
-
- sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n");
- sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n");
- sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n");
- disabled_wait();
+ sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table);
}
void __init startup_init(void)
{
+ kasan_early_init();
reset_tod_clock();
- check_image_bootable();
time_early_init();
init_kernel_storage_key();
lockdep_off();
+ sort_amode31_extable();
setup_lowcore_early();
setup_facility_list();
detect_machine_type();
setup_arch_string();
setup_boot_command_line();
detect_diag9c();
- detect_diag44();
detect_machine_facilities();
save_vector_registers();
setup_topology();
sclp_early_detect();
- setup_control_registers();
+ setup_low_address_protection();
setup_access_registers();
lockdep_on();
}
diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c
index 6f24d83bc5dc..d9d53f44008a 100644
--- a/arch/s390/kernel/early_printk.c
+++ b/arch/s390/kernel/early_printk.c
@@ -10,7 +10,7 @@
static void sclp_early_write(struct console *con, const char *s, unsigned int len)
{
- __sclp_early_printk(s, len, 0);
+ __sclp_early_printk(s, len);
}
static struct console sclp_early_console = {
diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S
new file mode 100644
index 000000000000..c634871f0d90
--- /dev/null
+++ b/arch/s390/kernel/earlypgm.S
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 2006, 2007
+ * Author(s): Michael Holzheu <holzheu@de.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+SYM_CODE_START(early_pgm_check_handler)
+ stmg %r8,%r15,__LC_SAVE_AREA_SYNC
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE)
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ stmg %r0,%r7,__PT_R0(%r11)
+ mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
+ lgr %r2,%r11
+ brasl %r14,__do_early_pgm_check
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ lpswe __LC_RETURN_PSW
+SYM_CODE_END(early_pgm_check_handler)
diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c
index 7f8246c9be08..0e51fa537262 100644
--- a/arch/s390/kernel/ebcdic.c
+++ b/arch/s390/kernel/ebcdic.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * ECBDIC -> ASCII, ASCII -> ECBDIC,
+ * EBCDIC -> ASCII, ASCII -> EBCDIC,
* upper to lower case (EBCDIC) conversion tables.
*
* S390 version
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 270d1d145761..49a11f6dd7ae 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -6,15 +6,15 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com),
* Hartmut Penner (hp@de.ibm.com),
* Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com),
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*/
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/asm-extable.h>
#include <asm/alternative-asm.h>
#include <asm/processor.h>
#include <asm/cache.h>
-#include <asm/ctl_reg.h>
#include <asm/dwarf.h>
#include <asm/errno.h>
#include <asm/ptrace.h>
@@ -27,67 +27,29 @@
#include <asm/vx-insn.h>
#include <asm/setup.h>
#include <asm/nmi.h>
-#include <asm/export.h>
#include <asm/nospec-insn.h>
-__PT_R0 = __PT_GPRS
-__PT_R1 = __PT_GPRS + 8
-__PT_R2 = __PT_GPRS + 16
-__PT_R3 = __PT_GPRS + 24
-__PT_R4 = __PT_GPRS + 32
-__PT_R5 = __PT_GPRS + 40
-__PT_R6 = __PT_GPRS + 48
-__PT_R7 = __PT_GPRS + 56
-__PT_R8 = __PT_GPRS + 64
-__PT_R9 = __PT_GPRS + 72
-__PT_R10 = __PT_GPRS + 80
-__PT_R11 = __PT_GPRS + 88
-__PT_R12 = __PT_GPRS + 96
-__PT_R13 = __PT_GPRS + 104
-__PT_R14 = __PT_GPRS + 112
-__PT_R15 = __PT_GPRS + 120
-
-STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER
-STACK_SIZE = 1 << STACK_SHIFT
-STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
-
-_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
- _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING)
-_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \
- _TIF_SYSCALL_TRACEPOINT)
-_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \
- _CIF_ASCE_SECONDARY | _CIF_FPU)
-_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
-
_LPP_OFFSET = __LC_LPP
-#define BASED(name) name-cleanup_critical(%r13)
+ .macro STBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193
+ .endm
- .macro TRACE_IRQS_ON
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_on_caller
-#endif
+ .macro LBEAR address
+ ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193
.endm
- .macro TRACE_IRQS_OFF
-#ifdef CONFIG_TRACE_IRQFLAGS
- basr %r2,%r0
- brasl %r14,trace_hardirqs_off_caller
-#endif
+ .macro LPSWEY address,lpswe
+ ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193
.endm
- .macro LOCKDEP_SYS_EXIT
-#ifdef CONFIG_LOCKDEP
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jz .+10
- brasl %r14,lockdep_sys_exit
-#endif
+ .macro MBEAR reg
+ ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193
.endm
.macro CHECK_STACK savearea
#ifdef CONFIG_CHECK_STACK
- tml %r15,STACK_SIZE - CONFIG_STACK_GUARD
+ tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD
lghi %r14,\savearea
jz stack_overflow
#endif
@@ -96,12 +58,14 @@ _LPP_OFFSET = __LC_LPP
.macro CHECK_VMAP_STACK savearea,oklabel
#ifdef CONFIG_VMAP_STACK
lgr %r14,%r15
- nill %r14,0x10000 - STACK_SIZE
- oill %r14,STACK_INIT
+ nill %r14,0x10000 - THREAD_SIZE
+ oill %r14,STACK_INIT_OFFSET
clg %r14,__LC_KERNEL_STACK
je \oklabel
clg %r14,__LC_ASYNC_STACK
je \oklabel
+ clg %r14,__LC_MCCK_STACK
+ je \oklabel
clg %r14,__LC_NODAT_STACK
je \oklabel
clg %r14,__LC_RESTART_STACK
@@ -113,56 +77,6 @@ _LPP_OFFSET = __LC_LPP
#endif
.endm
- .macro SWITCH_ASYNC savearea,timer
- tmhh %r8,0x0001 # interrupting from user ?
- jnz 1f
- lgr %r14,%r9
- slg %r14,BASED(.Lcritical_start)
- clg %r14,BASED(.Lcritical_length)
- jhe 0f
- lghi %r11,\savearea # inside critical section, do cleanup
- brasl %r14,cleanup_critical
- tmhh %r8,0x0001 # retest problem state after cleanup
- jnz 1f
-0: lg %r14,__LC_ASYNC_STACK # are we already on the target stack?
- slgr %r14,%r15
- srag %r14,%r14,STACK_SHIFT
- jnz 2f
- CHECK_STACK \savearea
- aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- j 3f
-1: UPDATE_VTIME %r14,%r15,\timer
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
-2: lg %r15,__LC_ASYNC_STACK # load async stack
-3: la %r11,STACK_FRAME_OVERHEAD(%r15)
- .endm
-
- .macro UPDATE_VTIME w1,w2,enter_timer
- lg \w1,__LC_EXIT_TIMER
- lg \w2,__LC_LAST_UPDATE_TIMER
- slg \w1,\enter_timer
- slg \w2,__LC_EXIT_TIMER
- alg \w1,__LC_USER_TIMER
- alg \w2,__LC_SYSTEM_TIMER
- stg \w1,__LC_USER_TIMER
- stg \w2,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer
- .endm
-
- .macro REENABLE_IRQS
- stg %r8,__LC_RETURN_PSW
- ni __LC_RETURN_PSW,0xbf
- ssm __LC_RETURN_PSW
- .endm
-
- .macro STCK savearea
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
- .insn s,0xb27c0000,\savearea # store clock fast
-#else
- .insn s,0xb2050000,\savearea # store clock
-#endif
- .endm
-
/*
* The TSTMSK macro generates a test-under-mask instruction by
* calculating the memory offset for the specified mask value.
@@ -186,46 +100,76 @@ _LPP_OFFSET = __LC_LPP
.endm
.macro BPOFF
- ALTERNATIVE "", ".long 0xb2e8c000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82
.endm
.macro BPON
- ALTERNATIVE "", ".long 0xb2e8d000", 82
+ ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82
.endm
.macro BPENTER tif_ptr,tif_mask
- ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \
- "", 82
+ ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \
+ "j .+12; nop; nop", 82
.endm
.macro BPEXIT tif_ptr,tif_mask
TSTMSK \tif_ptr,\tif_mask
- ALTERNATIVE "jz .+8; .long 0xb2e8c000", \
- "jnz .+8; .long 0xb2e8d000", 82
+ ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \
+ "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82
+ .endm
+
+#if IS_ENABLED(CONFIG_KVM)
+ /*
+ * The OUTSIDE macro jumps to the provided label in case the value
+ * in the provided register is outside of the provided range. The
+ * macro is useful for checking whether a PSW stored in a register
+ * pair points inside or outside of a block of instructions.
+ * @reg: register to check
+ * @start: start of the range
+ * @end: end of the range
+ * @outside_label: jump here if @reg is outside of [@start..@end)
+ */
+ .macro OUTSIDE reg,start,end,outside_label
+ lgr %r14,\reg
+ larl %r13,\start
+ slgr %r14,%r13
+ clgfrl %r14,.Lrange_size\@
+ jhe \outside_label
+ .section .rodata, "a"
+ .balign 4
+.Lrange_size\@:
+ .long \end - \start
+ .previous
+ .endm
+
+ .macro SIEEXIT
+ lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
+ ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
+ larl %r9,sie_exit # skip forward to sie_exit
+ .endm
+#endif
+
+ .macro STACKLEAK_ERASE
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ brasl %r14,stackleak_erase_on_task_stack
+#endif
.endm
- GEN_BR_THUNK %r9
GEN_BR_THUNK %r14
- GEN_BR_THUNK %r14,%r11
.section .kprobes.text, "ax"
.Ldummy:
/*
- * This nop exists only in order to avoid that __switch_to starts at
- * the beginning of the kprobes text section. In that case we would
- * have several symbols at the same address. E.g. objdump would take
- * an arbitrary symbol name when disassembling this code.
- * With the added nop in between the __switch_to symbol is unique
- * again.
+ * The following nop exists only in order to avoid that the next
+ * symbol starts at the beginning of the kprobes text section.
+ * In that case there would be several symbols at the same address.
+ * E.g. objdump would take an arbitrary symbol when disassembling
+ * the code.
+ * With the added nop in between this cannot happen.
*/
nop 0
-ENTRY(__bpon)
- .globl __bpon
- BPON
- BR_EX %r14
-ENDPROC(__bpon)
-
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -233,11 +177,11 @@ ENDPROC(__bpon)
* Returns:
* gpr2 = prev
*/
-ENTRY(__switch_to)
+SYM_FUNC_START(__switch_to)
stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task
lghi %r4,__TASK_stack
lghi %r1,__TASK_thread
- llill %r5,STACK_INIT
+ llill %r5,STACK_INIT_OFFSET
stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev
lg %r15,0(%r4,%r3) # start of kernel stack of next
agr %r15,%r5 # end of kernel stack of next
@@ -247,30 +191,26 @@ ENTRY(__switch_to)
aghi %r3,__TASK_pid
mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next
lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
BR_EX %r14
-ENDPROC(__switch_to)
-
-.L__critical_start:
+SYM_FUNC_END(__switch_to)
#if IS_ENABLED(CONFIG_KVM)
/*
- * sie64a calling convention:
- * %r2 pointer to sie control block
- * %r3 guest register save area
+ * __sie64a calling convention:
+ * %r2 pointer to sie control block phys
+ * %r3 pointer to sie control block virt
+ * %r4 guest register save area
*/
-ENTRY(sie64a)
+SYM_FUNC_START(__sie64a)
stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers
lg %r12,__LC_CURRENT
- stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer
- stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area
+ stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical..
+ stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses
+ stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area
xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ?
- jno .Lsie_load_guest_gprs
- brasl %r14,load_fpu_regs # load guest fp/vx regs
-.Lsie_load_guest_gprs:
- lmg %r0,%r13,0(%r3) # load guest gprs 0-13
+ lmg %r0,%r13,0(%r4) # load guest gprs 0-13
lg %r14,__LC_GMAP # get gmap pointer
ltgr %r14,%r14
jz .Lsie_gmap
@@ -282,35 +222,37 @@ ENTRY(sie64a)
jnz .Lsie_skip
TSTMSK __LC_CPU_FLAGS,_CIF_FPU
jo .Lsie_skip # exit if fp/vx regs changed
- BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr
+ BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
.Lsie_entry:
sie 0(%r14)
-.Lsie_exit:
+# Let the next instruction be NOP to avoid triggering a machine check
+# and handling it in a guest as result of the instruction execution.
+ nopr 7
+.Lsie_leave:
BPOFF
- BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
.Lsie_skip:
+ lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce
.Lsie_done:
# some program checks are suppressing. C code (e.g. do_protection_exception)
# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
-# Other instructions between sie64a and .Lsie_done should not cause program
+# Other instructions between __sie64a and .Lsie_done should not cause program
# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
.Lrewind_pad6:
nopr 7
.Lrewind_pad4:
nopr 7
.Lrewind_pad2:
nopr 7
- .globl sie_exit
-sie_exit:
+SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
xgr %r0,%r0 # clear guest registers to
xgr %r1,%r1 # prevent speculative use
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -326,838 +268,244 @@ sie_exit:
EX_TABLE(.Lrewind_pad4,.Lsie_fault)
EX_TABLE(.Lrewind_pad2,.Lsie_fault)
EX_TABLE(sie_exit,.Lsie_fault)
-ENDPROC(sie64a)
-EXPORT_SYMBOL(sie64a)
+SYM_FUNC_END(__sie64a)
+EXPORT_SYMBOL(__sie64a)
EXPORT_SYMBOL(sie_exit)
#endif
/*
* SVC interrupt handler routine. System calls are synchronous events and
- * are executed with interrupts enabled.
+ * are entered with interrupts disabled.
*/
-ENTRY(system_call)
- stpt __LC_SYNC_ENTER_TIMER
-.Lsysc_stmg:
+SYM_CODE_START(system_call)
+ stpt __LC_SYS_ENTER_TIMER
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
BPOFF
- lg %r12,__LC_CURRENT
- lghi %r13,__TASK_thread
- lghi %r14,_PIF_SYSCALL
+ lghi %r14,0
.Lsysc_per:
+ STBEAR __LC_LAST_BREAK
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
lg %r15,__LC_KERNEL_STACK
- la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
-.Lsysc_vtime:
- UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- stmg %r0,%r7,__PT_R0(%r11)
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC
- stg %r14,__PT_FLAGS(%r11)
-.Lsysc_do_svc:
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
# clear user controlled register to prevent speculative use
xgr %r0,%r0
- # load address of system call table
- lg %r10,__THREAD_sysc_table(%r13,%r12)
- llgh %r8,__PT_INT_CODE+2(%r11)
- slag %r8,%r8,3 # shift and test for svc 0
- jnz .Lsysc_nr_ok
- # svc 0: system call number in %r1
- llgfr %r1,%r1 # clear high word in r1
- cghi %r1,NR_syscalls
- jnl .Lsysc_nr_ok
- sth %r1,__PT_INT_CODE+2(%r11)
- slag %r8,%r1,3
-.Lsysc_nr_ok:
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- stg %r2,__PT_ORIG_GPR2(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r9,0(%r8,%r10) # get system call add.
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jnz .Lsysc_tracesys
- BASR_EX %r14,%r9 # call sys_xxxx
- stg %r2,__PT_R2(%r11) # store return value
-
-.Lsysc_return:
-#ifdef CONFIG_DEBUG_RSEQ
- lgr %r2,%r11
- brasl %r14,rseq_syscall
-#endif
- LOCKDEP_SYS_EXIT
-.Lsysc_tif:
- TSTMSK __PT_FLAGS(%r11),_PIF_WORK
- jnz .Lsysc_work
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lsysc_work # check for work
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lsysc_work
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lsysc_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
-.Lsysc_exit_timer:
+ xgr %r1,%r1
+ xgr %r4,%r4
+ xgr %r5,%r5
+ xgr %r6,%r6
+ xgr %r7,%r7
+ xgr %r8,%r8
+ xgr %r9,%r9
+ xgr %r10,%r10
+ xgr %r11,%r11
+ la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs
+ mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC
+ MBEAR %r2
+ lgr %r3,%r14
+ brasl %r14,__do_syscall
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ BPON
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lsysc_done:
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lsysc_work:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lsysc_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lsysc_reschedule
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
-#ifdef CONFIG_UPROBES
- TSTMSK __TI_flags(%r12),_TIF_UPROBE
- jo .Lsysc_uprobe_notify
-#endif
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lsysc_guarded_storage
- TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP
- jo .Lsysc_singlestep
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lsysc_patch_pending # handle live patching just before
- # signals and possible syscall restart
-#endif
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART
- jo .Lsysc_syscall_restart
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lsysc_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lsysc_notify_resume
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsysc_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lsysc_asce
- j .Lsysc_return # beware of critical section cleanup
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lsysc_reschedule:
- larl %r14,.Lsysc_return
- jg schedule
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lsysc_mcck_pending:
- larl %r14,.Lsysc_return
- jg s390_handle_mcck # TIF bit will be cleared by handler
-
-#
-# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lsysc_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lsysc_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lsysc_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lsysc_return
-.Lsysc_set_fs_fixup:
-#endif
- larl %r14,.Lsysc_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lsysc_vxrs:
- larl %r14,.Lsysc_return
- jg load_fpu_regs
-
-#
-# _TIF_SIGPENDING is set, call do_signal
-#
-.Lsysc_sigpending:
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jno .Lsysc_return
-.Lsysc_do_syscall:
- lghi %r13,__TASK_thread
- lmg %r2,%r7,__PT_R2(%r11) # load svc arguments
- lghi %r1,0 # svc 0 returns -ENOSYS
- j .Lsysc_do_svc
-
-#
-# _TIF_NOTIFY_RESUME is set, call do_notify_resume
-#
-.Lsysc_notify_resume:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_notify_resume
-
-#
-# _TIF_UPROBE is set, call uprobe_notify_resume
-#
-#ifdef CONFIG_UPROBES
-.Lsysc_uprobe_notify:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg uprobe_notify_resume
-#endif
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lsysc_guarded_storage:
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg gs_load_bc_cb
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lsysc_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lsysc_return
- jg klp_update_patch_state
-#endif
-
-#
-# _PIF_PER_TRAP is set, call do_per_trap
-#
-.Lsysc_singlestep:
- ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_per_trap
-
-#
-# _PIF_SYSCALL_RESTART is set, repeat the current system call
-#
-.Lsysc_syscall_restart:
- ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART
- lmg %r1,%r7,__PT_R1(%r11) # load svc arguments
- lg %r2,__PT_ORIG_GPR2(%r11)
- j .Lsysc_do_svc
-
-#
-# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before
-# and after the system call
-#
-.Lsysc_tracesys:
- lgr %r2,%r11 # pass pointer to pt_regs
- la %r3,0
- llgh %r0,__PT_INT_CODE+2(%r11)
- stg %r0,__PT_R2(%r11)
- brasl %r14,do_syscall_trace_enter
- lghi %r0,NR_syscalls
- clgr %r0,%r2
- jnh .Lsysc_tracenogo
- sllg %r8,%r2,3
- lg %r9,0(%r8,%r10)
-.Lsysc_tracego:
- lmg %r3,%r7,__PT_R3(%r11)
- stg %r7,STACK_FRAME_OVERHEAD(%r15)
- lg %r2,__PT_ORIG_GPR2(%r11)
- BASR_EX %r14,%r9 # call sys_xxx
- stg %r2,__PT_R2(%r11) # store return value
-.Lsysc_tracenogo:
- TSTMSK __TI_flags(%r12),_TIF_TRACE
- jz .Lsysc_return
- lgr %r2,%r11 # pass pointer to pt_regs
- larl %r14,.Lsysc_return
- jg do_syscall_trace_exit
-ENDPROC(system_call)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(system_call)
#
# a new process exits the kernel with ret_from_fork
#
-ENTRY(ret_from_fork)
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- lg %r12,__LC_CURRENT
- brasl %r14,schedule_tail
- TRACE_IRQS_ON
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ?
- jne .Lsysc_tracenogo
- # it's a kernel thread
- lmg %r9,%r10,__PT_R9(%r11) # load gprs
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
-ENDPROC(ret_from_fork)
-
-ENTRY(kernel_thread_starter)
- la %r2,0(%r10)
- BASR_EX %r14,%r9
- j .Lsysc_tracenogo
-ENDPROC(kernel_thread_starter)
+SYM_CODE_START(ret_from_fork)
+ lgr %r3,%r11
+ brasl %r14,__ret_from_fork
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ BPON
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ stpt __LC_EXIT_TIMER
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(ret_from_fork)
/*
* Program check handler routine
*/
-ENTRY(pgm_check_handler)
- stpt __LC_SYNC_ENTER_TIMER
+SYM_CODE_START(pgm_check_handler)
+ stpt __LC_SYS_ENTER_TIMER
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_SYNC
- lg %r10,__LC_LAST_BREAK
- lg %r12,__LC_CURRENT
- lghi %r11,0
- larl %r13,cleanup_critical
+ lghi %r10,0
lmg %r8,%r9,__LC_PGM_OLD_PSW
- tmhh %r8,0x0001 # test problem state bit
- jnz 2f # -> fault in user space
+ tmhh %r8,0x0001 # coming from user space?
+ jno .Lpgm_skip_asce
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
+ j 3f # -> fault in user space
+.Lpgm_skip_asce:
#if IS_ENABLED(CONFIG_KVM)
- # cleanup critical section for program checks in sie64a
- lgr %r14,%r9
- slg %r14,BASED(.Lsie_critical_start)
- clg %r14,BASED(.Lsie_critical_length)
- jhe 0f
- lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- lghi %r11,_PIF_GUEST_FAULT
+ # cleanup critical section for program checks in __sie64a
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+ lghi %r10,_PIF_GUEST_FAULT
#endif
-0: tmhh %r8,0x4000 # PER bit set in old PSW ?
- jnz 1f # -> enabled, can't be a double fault
+1: tmhh %r8,0x4000 # PER bit set in old PSW ?
+ jnz 2f # -> enabled, can't be a double fault
tm __LC_PGM_ILC+3,0x80 # check for per exception
jnz .Lpgm_svcper # -> single stepped svc
-1: CHECK_STACK __LC_SAVE_AREA_SYNC
+2: CHECK_STACK __LC_SAVE_AREA_SYNC
aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
# CHECK_VMAP_STACK branches to stack_overflow or 4f
CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f
-2: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- lg %r15,__LC_KERNEL_STACK
- lgr %r14,%r12
- aghi %r14,__TASK_thread # pointer to thread_struct
- lghi %r13,__LC_PGM_TDB
- tm __LC_PGM_ILC+2,0x02 # check for transaction abort
- jz 3f
- mvc __THREAD_trap_tdb(256,%r14),0(%r13)
-3: stg %r10,__THREAD_last_break(%r14)
-4: lgr %r13,%r11
- la %r11,STACK_FRAME_OVERHEAD(%r15)
+3: lg %r15,__LC_KERNEL_STACK
+4: la %r11,STACK_FRAME_OVERHEAD(%r15)
+ stg %r10,__PT_FLAGS(%r11)
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
stmg %r0,%r7,__PT_R0(%r11)
+ mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
+ mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK
+ stmg %r8,%r9,__PT_PSW(%r11)
+
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC
- mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE
- stg %r13,__PT_FLAGS(%r11)
- stg %r10,__PT_ARGS(%r11)
- tm __LC_PGM_ILC+3,0x80 # check for per exception
- jz 5f
- tmhh %r8,0x0001 # kernel per event ?
- jz .Lpgm_kprobe
- oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP
- mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS
- mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE
- mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID
-5: REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- larl %r1,pgm_check_table
- llgh %r10,__PT_INT_CODE+2(%r11)
- nill %r10,0x007f
- sll %r10,3
- je .Lpgm_return
- lg %r9,0(%r10,%r1) # load address of handler routine
- lgr %r2,%r11 # pass pointer to pt_regs
- BASR_EX %r14,%r9 # branch to interrupt-handler
-.Lpgm_return:
- LOCKDEP_SYS_EXIT
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lsysc_restore
- TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL
- jo .Lsysc_do_syscall
- j .Lsysc_tif
-
-#
-# PER event in supervisor state, must be kprobes
-#
-.Lpgm_kprobe:
- REENABLE_IRQS
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_per_trap
- j .Lpgm_return
+ lgr %r2,%r11
+ brasl %r14,__do_pgm_check
+ tmhh %r8,0x0001 # returning to user space?
+ jno .Lpgm_exit_kernel
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ BPON
+ stpt __LC_EXIT_TIMER
+.Lpgm_exit_kernel:
+ mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+ LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+ lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
#
# single stepped system call
#
.Lpgm_svcper:
mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW
- lghi %r13,__TASK_thread
larl %r14,.Lsysc_per
stg %r14,__LC_RETURN_PSW+8
- lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP
- lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs
-ENDPROC(pgm_check_handler)
+ lghi %r14,1
+ LBEAR __LC_PGM_LAST_BREAK
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per
+SYM_CODE_END(pgm_check_handler)
/*
- * IO interrupt handler routine
+ * Interrupt handler macro used for external and IO interrupts.
*/
-ENTRY(io_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
+.macro INT_HANDLER name,lc_old_psw,handler
+SYM_CODE_START(\name)
+ stckf __LC_INT_CLOCK
+ stpt __LC_SYS_ENTER_TIMER
+ STBEAR __LC_LAST_BREAK
BPOFF
stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_IO_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
+ lmg %r8,%r9,\lc_old_psw
+ tmhh %r8,0x0001 # interrupting from user ?
+ jnz 1f
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+#endif
+0: CHECK_STACK __LC_SAVE_AREA_ASYNC
+ aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
+ j 2f
+1: lctlg %c1,%c1,__LC_KERNEL_ASCE
+ lg %r15,__LC_KERNEL_STACK
+2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
xgr %r10,%r10
+ xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
+ MBEAR %r11
stmg %r8,%r9,__PT_PSW(%r11)
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-.Lio_loop:
lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,IO_INTERRUPT
- tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ?
- jz .Lio_call
- lghi %r3,THIN_INTERRUPT
-.Lio_call:
- brasl %r14,do_IRQ
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR
- jz .Lio_return
- tpi 0
- jz .Lio_return
- mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
- j .Lio_loop
-.Lio_return:
- LOCKDEP_SYS_EXIT
- TRACE_IRQS_ON
-.Lio_tif:
- TSTMSK __TI_flags(%r12),_TIF_WORK
- jnz .Lio_work # there is work to do (signals etc.)
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK
- jnz .Lio_work
-.Lio_restore:
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r10,__PT_R0(%r11)
+ brasl %r14,\handler
mvc __LC_RETURN_PSW(16),__PT_PSW(%r11)
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lio_exit_kernel
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
-.Lio_exit_timer:
+ tmhh %r8,0x0001 # returning to user ?
+ jno 2f
+ STACKLEAK_ERASE
+ lctlg %c1,%c1,__LC_USER_ASCE
+ BPON
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-.Lio_exit_kernel:
- lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_PSW
-.Lio_done:
-
-#
-# There is work todo, find out in which context we have been interrupted:
-# 1) if we return to user space we can do all _TIF_WORK work
-# 2) if we return to kernel code and kvm is enabled check if we need to
-# modify the psw to leave SIE
-# 3) if we return to kernel code and preemptive scheduling is enabled check
-# the preemption counter and if it is zero call preempt_schedule_irq
-# Before any work can be done, a switch to the kernel stack is required.
-#
-.Lio_work:
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jo .Lio_work_user # yes -> do resched & signal
-#ifdef CONFIG_PREEMPT
- # check for preemptive scheduling
- icm %r0,15,__LC_PREEMPT_COUNT
- jnz .Lio_restore # preemption is disabled
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jno .Lio_restore
- # switch to kernel stack
- lg %r1,__PT_R15(%r11)
- aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
- # TRACE_IRQS_ON already done at .Lio_return, call
- # TRACE_IRQS_OFF to keep things symmetrical
- TRACE_IRQS_OFF
- brasl %r14,preempt_schedule_irq
- j .Lio_return
-#else
- j .Lio_restore
-#endif
-
-#
-# Need to do work before returning to userspace, switch to kernel stack
-#
-.Lio_work_user:
- lg %r1,__LC_KERNEL_STACK
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
-
-#
-# One of the work bits is on. Find out which one.
-#
-.Lio_work_tif:
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jo .Lio_mcck_pending
- TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED
- jo .Lio_reschedule
-#ifdef CONFIG_LIVEPATCH
- TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING
- jo .Lio_patch_pending
-#endif
- TSTMSK __TI_flags(%r12),_TIF_SIGPENDING
- jo .Lio_sigpending
- TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME
- jo .Lio_notify_resume
- TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE
- jo .Lio_guarded_storage
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lio_vxrs
- TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY)
- jnz .Lio_asce
- j .Lio_return # beware of critical section cleanup
-
-#
-# _CIF_MCCK_PENDING is set, call handler
-#
-.Lio_mcck_pending:
- # TRACE_IRQS_ON already done at .Lio_return
- brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler
- TRACE_IRQS_OFF
- j .Lio_return
+2: LBEAR __PT_LAST_BREAK(%r11)
+ lmg %r0,%r15,__PT_R0(%r11)
+ LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE
+SYM_CODE_END(\name)
+.endm
-#
-# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce
-#
-.Lio_asce:
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY
- lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce
- TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY
- jz .Lio_return
-#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES
- tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ?
- jnz .Lio_set_fs_fixup
- ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- j .Lio_return
-.Lio_set_fs_fixup:
-#endif
- larl %r14,.Lio_return
- jg set_fs_fixup
-
-#
-# CIF_FPU is set, restore floating-point controls and floating-point registers.
-#
-.Lio_vxrs:
- larl %r14,.Lio_return
- jg load_fpu_regs
-
-#
-# _TIF_GUARDED_STORAGE is set, call guarded_storage_load
-#
-.Lio_guarded_storage:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,gs_load_bc_cb
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NEED_RESCHED is set, call schedule
-#
-.Lio_reschedule:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- brasl %r14,schedule # call scheduler
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_PATCH_PENDING is set, call klp_update_patch_state
-#
-#ifdef CONFIG_LIVEPATCH
-.Lio_patch_pending:
- lg %r2,__LC_CURRENT # pass pointer to task struct
- larl %r14,.Lio_return
- jg klp_update_patch_state
-#endif
-
-#
-# _TIF_SIGPENDING or is set, call do_signal
-#
-.Lio_sigpending:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_signal
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-
-#
-# _TIF_NOTIFY_RESUME or is set, call do_notify_resume
-#
-.Lio_notify_resume:
- # TRACE_IRQS_ON already done at .Lio_return
- ssm __LC_SVC_NEW_PSW # reenable interrupts
- lgr %r2,%r11 # pass pointer to pt_regs
- brasl %r14,do_notify_resume
- ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts
- TRACE_IRQS_OFF
- j .Lio_return
-ENDPROC(io_int_handler)
+INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq
+INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq
/*
- * External interrupt handler routine
+ * Load idle PSW.
*/
-ENTRY(ext_int_handler)
- STCK __LC_INT_CLOCK
- stpt __LC_ASYNC_ENTER_TIMER
- BPOFF
- stmg %r8,%r15,__LC_SAVE_AREA_ASYNC
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
- lmg %r8,%r9,__LC_EXT_OLD_PSW
- SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
- stmg %r0,%r7,__PT_R0(%r11)
- # clear user controlled registers to prevent speculative use
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- xgr %r4,%r4
- xgr %r5,%r5
- xgr %r6,%r6
- xgr %r7,%r7
- xgr %r10,%r10
- mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
- stmg %r8,%r9,__PT_PSW(%r11)
- lghi %r1,__LC_EXT_PARAMS2
- mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR
- mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
- mvc __PT_INT_PARM_LONG(8,%r11),0(%r1)
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
- TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ
- jo .Lio_restore
- TRACE_IRQS_OFF
- xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
- lgr %r2,%r11 # pass pointer to pt_regs
- lghi %r3,EXT_INTERRUPT
- brasl %r14,do_IRQ
- j .Lio_return
-ENDPROC(ext_int_handler)
-
-/*
- * Load idle PSW. The second "half" of this function is in .Lcleanup_idle.
- */
-ENTRY(psw_idle)
+SYM_FUNC_START(psw_idle)
+ stg %r14,(__SF_GPRS+8*8)(%r15)
stg %r3,__SF_EMPTY(%r15)
- larl %r1,.Lpsw_idle_lpsw+4
+ larl %r1,psw_idle_exit
stg %r1,__SF_EMPTY+8(%r15)
larl %r1,smp_cpu_mtid
llgf %r1,0(%r1)
ltgr %r1,%r1
jz .Lpsw_idle_stcctm
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15)
+ .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2)
.Lpsw_idle_stcctm:
oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT
BPON
- STCK __CLOCK_IDLE_ENTER(%r2)
+ stckf __CLOCK_IDLE_ENTER(%r2)
stpt __TIMER_IDLE_ENTER(%r2)
-.Lpsw_idle_lpsw:
lpswe __SF_EMPTY(%r15)
+SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL)
BR_EX %r14
-.Lpsw_idle_end:
-ENDPROC(psw_idle)
-
-/*
- * Store floating-point controls and floating-point or vector register
- * depending whether the vector facility is available. A critical section
- * cleanup assures that the registers are stored even if interrupted for
- * some other work. The CIF_FPU flag is set to trigger a lazy restore
- * of the register contents at return from io or a system call.
- */
-ENTRY(save_fpu_regs)
- lg %r2,__LC_CURRENT
- aghi %r2,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsave_fpu_regs_exit
- stfpc __THREAD_FPU_fpc(%r2)
- lg %r3,__THREAD_FPU_regs(%r2)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jz .Lsave_fpu_regs_fp # no -> store FP regs
- VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3)
- VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3)
- j .Lsave_fpu_regs_done # -> set CIF_FPU flag
-.Lsave_fpu_regs_fp:
- std 0,0(%r3)
- std 1,8(%r3)
- std 2,16(%r3)
- std 3,24(%r3)
- std 4,32(%r3)
- std 5,40(%r3)
- std 6,48(%r3)
- std 7,56(%r3)
- std 8,64(%r3)
- std 9,72(%r3)
- std 10,80(%r3)
- std 11,88(%r3)
- std 12,96(%r3)
- std 13,104(%r3)
- std 14,112(%r3)
- std 15,120(%r3)
-.Lsave_fpu_regs_done:
- oi __LC_CPU_FLAGS+7,_CIF_FPU
-.Lsave_fpu_regs_exit:
- BR_EX %r14
-.Lsave_fpu_regs_end:
-ENDPROC(save_fpu_regs)
-EXPORT_SYMBOL(save_fpu_regs)
-
-/*
- * Load floating-point controls and floating-point or vector registers.
- * A critical section cleanup assures that the register contents are
- * loaded even if interrupted for some other work.
- *
- * There are special calling conventions to fit into sysc and io return work:
- * %r15: <kernel stack>
- * The function requires:
- * %r4
- */
-load_fpu_regs:
- lg %r4,__LC_CURRENT
- aghi %r4,__TASK_thread
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jno .Lload_fpu_regs_exit
- lfpc __THREAD_FPU_fpc(%r4)
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area
- jz .Lload_fpu_regs_fp # -> no VX, load FP regs
- VLM %v0,%v15,0,%r4
- VLM %v16,%v31,256,%r4
- j .Lload_fpu_regs_done
-.Lload_fpu_regs_fp:
- ld 0,0(%r4)
- ld 1,8(%r4)
- ld 2,16(%r4)
- ld 3,24(%r4)
- ld 4,32(%r4)
- ld 5,40(%r4)
- ld 6,48(%r4)
- ld 7,56(%r4)
- ld 8,64(%r4)
- ld 9,72(%r4)
- ld 10,80(%r4)
- ld 11,88(%r4)
- ld 12,96(%r4)
- ld 13,104(%r4)
- ld 14,112(%r4)
- ld 15,120(%r4)
-.Lload_fpu_regs_done:
- ni __LC_CPU_FLAGS+7,255-_CIF_FPU
-.Lload_fpu_regs_exit:
- BR_EX %r14
-.Lload_fpu_regs_end:
-ENDPROC(load_fpu_regs)
-
-.L__critical_end:
+SYM_FUNC_END(psw_idle)
/*
* Machine check handler routines
*/
-ENTRY(mcck_int_handler)
- STCK __LC_MCCK_CLOCK
+SYM_CODE_START(mcck_int_handler)
BPOFF
la %r1,4095 # validate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
- sckc __LC_CLOCK_COMPARATOR # validate comparator
- lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs
- lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
- lg %r12,__LC_CURRENT
- larl %r13,cleanup_critical
+ LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear
+ lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs
lmg %r8,%r9,__LC_MCK_OLD_PSW
TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
jo .Lmcck_panic # yes -> rest of mcck code invalid
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID
jno .Lmcck_panic # control registers invalid -> panic
- la %r14,4095
- lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs
+ lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs
ptlb
- lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area
- nill %r11,0xfc00 # MCESA_ORIGIN_MASK
- TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE
- jno 0f
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID
- jno 0f
- .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC
-0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14)
- TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID
- jo 0f
- sr %r14,%r14
-0: sfpc %r14
- TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX
- jo 0f
- lghi %r14,__LC_FPREGS_SAVE_AREA
- ld %f0,0(%r14)
- ld %f1,8(%r14)
- ld %f2,16(%r14)
- ld %f3,24(%r14)
- ld %f4,32(%r14)
- ld %f5,40(%r14)
- ld %f6,48(%r14)
- ld %f7,56(%r14)
- ld %f8,64(%r14)
- ld %f9,72(%r14)
- ld %f10,80(%r14)
- ld %f11,88(%r14)
- ld %f12,96(%r14)
- ld %f13,104(%r14)
- ld %f14,112(%r14)
- ld %f15,120(%r14)
- j 1f
-0: VLM %v0,%v15,0,%r11
- VLM %v16,%v31,256,%r11
-1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA
+ lghi %r14,__LC_CPU_TIMER_SAVE_AREA
mvc __LC_MCCK_ENTER_TIMER(8),0(%r14)
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
jo 3f
- la %r14,__LC_SYNC_ENTER_TIMER
- clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER
- jl 0f
- la %r14,__LC_ASYNC_ENTER_TIMER
-0: clc 0(8,%r14),__LC_EXIT_TIMER
+ la %r14,__LC_SYS_ENTER_TIMER
+ clc 0(8,%r14),__LC_EXIT_TIMER
jl 1f
la %r14,__LC_EXIT_TIMER
1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER
@@ -1168,18 +516,27 @@ ENTRY(mcck_int_handler)
3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
jno .Lmcck_panic
tmhh %r8,0x0001 # interrupting from user ?
- jnz 4f
+ jnz .Lmcck_user
TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
jno .Lmcck_panic
-4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off
- SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER
-.Lmcck_skip:
+#if IS_ENABLED(CONFIG_KVM)
+ OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_user
+ OUTSIDE %r9,.Lsie_entry,.Lsie_leave,4f
+ oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
+4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT
+#endif
+.Lmcck_user:
+ lg %r15,__LC_MCCK_STACK
+ la %r11,STACK_FRAME_OVERHEAD(%r15)
+ stctg %c1,%c1,__PT_CR1(%r11)
+ lctlg %c1,%c1,__LC_KERNEL_ASCE
+ xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lghi %r14,__LC_GPREGS_SAVE_AREA+64
stmg %r0,%r7,__PT_R0(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
- xgr %r2,%r2
xgr %r3,%r3
xgr %r4,%r4
xgr %r5,%r5
@@ -1192,42 +549,57 @@ ENTRY(mcck_int_handler)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
brasl %r14,s390_do_machine_check
- tm __PT_PSW+1(%r11),0x01 # returning to user ?
- jno .Lmcck_return
- lg %r1,__LC_KERNEL_STACK # switch to kernel stack
- mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11)
- xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1)
- la %r11,STACK_FRAME_OVERHEAD(%r1)
- lgr %r15,%r1
- TSTMSK __LC_CPU_FLAGS,_CIF_MCCK_PENDING
- jno .Lmcck_return
- TRACE_IRQS_OFF
- brasl %r14,s390_handle_mcck
- TRACE_IRQS_ON
-.Lmcck_return:
- lg %r14,__LC_VDSO_PER_CPU
+ lctlg %c1,%c1,__PT_CR1(%r11)
lmg %r0,%r10,__PT_R0(%r11)
mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
- BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+ BPON
stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
-0: lmg %r11,%r15,__PT_R11(%r11)
- lpswe __LC_RETURN_MCCK_PSW
+0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193
+ LBEAR 0(%r12)
+ lmg %r11,%r15,__PT_R11(%r11)
+ LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE
.Lmcck_panic:
- lg %r15,__LC_NODAT_STACK
- la %r11,STACK_FRAME_OVERHEAD(%r15)
- j .Lmcck_skip
-ENDPROC(mcck_int_handler)
-
-#
-# PSW restart interrupt handler
-#
-ENTRY(restart_int_handler)
- ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40
+ /*
+ * Iterate over all possible CPU addresses in the range 0..0xffff
+ * and stop each CPU using signal processor. Use compare and swap
+ * to allow just one CPU-stopper and prevent concurrent CPUs from
+ * stopping each other while leaving the others running.
+ */
+ lhi %r5,0
+ lhi %r6,1
+ larl %r7,stop_lock
+ cs %r5,%r6,0(%r7) # single CPU-stopper only
+ jnz 4f
+ larl %r7,this_cpu
+ stap 0(%r7) # this CPU address
+ lh %r4,0(%r7)
+ nilh %r4,0
+ lhi %r0,1
+ sll %r0,16 # CPU counter
+ lhi %r3,0 # next CPU address
+0: cr %r3,%r4
+ je 2f
+1: sigp %r1,%r3,SIGP_STOP # stop next CPU
+ brc SIGP_CC_BUSY,1b
+2: ahi %r3,1
+ brct %r0,0b
+3: sigp %r1,%r4,SIGP_STOP # stop this CPU
+ brc SIGP_CC_BUSY,3b
+4: j 4b
+SYM_CODE_END(mcck_int_handler)
+
+SYM_CODE_START(restart_int_handler)
+ ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
stg %r15,__LC_SAVE_AREA_RESTART
+ TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4
+ jz 0f
+ lctlg %c0,%c15,__LC_CREGS_SAVE_AREA
+0: larl %r15,daton_psw
+ lpswe 0(%r15) # turn dat on, keep irqs off
+.Ldaton:
lg %r15,__LC_RESTART_STACK
xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
@@ -1236,7 +608,7 @@ ENTRY(restart_int_handler)
xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15)
lg %r1,__LC_RESTART_FN # load fn, parm & source cpu
lg %r2,__LC_RESTART_DATA
- lg %r3,__LC_RESTART_SOURCE
+ lgf %r3,__LC_RESTART_SOURCE
ltgr %r3,%r3 # test source cpu address
jm 1f # negative -> skip source stop
0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu
@@ -1247,7 +619,7 @@ ENTRY(restart_int_handler)
2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu
brc 2,2b
3: j 3b
-ENDPROC(restart_int_handler)
+SYM_CODE_END(restart_int_handler)
.section .kprobes.text, "ax"
@@ -1257,7 +629,7 @@ ENDPROC(restart_int_handler)
* No need to properly save the registers, we are going to panic anyway.
* Setup a pt_regs so that show_trace can provide a good call trace.
*/
-ENTRY(stack_overflow)
+SYM_CODE_START(stack_overflow)
lg %r15,__LC_NODAT_STACK # change to panic stack
la %r11,STACK_FRAME_OVERHEAD(%r15)
stmg %r0,%r7,__PT_R0(%r11)
@@ -1267,278 +639,31 @@ ENTRY(stack_overflow)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
lgr %r2,%r11 # pass pointer to pt_regs
jg kernel_stack_overflow
-ENDPROC(stack_overflow)
+SYM_CODE_END(stack_overflow)
#endif
-ENTRY(cleanup_critical)
-#if IS_ENABLED(CONFIG_KVM)
- clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap
- jl 0f
- clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done
- jl .Lcleanup_sie
-#endif
- clg %r9,BASED(.Lcleanup_table) # system_call
- jl 0f
- clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc
- jl .Lcleanup_system_call
- clg %r9,BASED(.Lcleanup_table+16) # .Lsysc_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+24) # .Lsysc_restore
- jl .Lcleanup_sysc_tif
- clg %r9,BASED(.Lcleanup_table+32) # .Lsysc_done
- jl .Lcleanup_sysc_restore
- clg %r9,BASED(.Lcleanup_table+40) # .Lio_tif
- jl 0f
- clg %r9,BASED(.Lcleanup_table+48) # .Lio_restore
- jl .Lcleanup_io_tif
- clg %r9,BASED(.Lcleanup_table+56) # .Lio_done
- jl .Lcleanup_io_restore
- clg %r9,BASED(.Lcleanup_table+64) # psw_idle
- jl 0f
- clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end
- jl .Lcleanup_idle
- clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end
- jl .Lcleanup_save_fpu_regs
- clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs
- jl 0f
- clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end
- jl .Lcleanup_load_fpu_regs
-0: BR_EX %r14,%r11
-ENDPROC(cleanup_critical)
-
- .align 8
-.Lcleanup_table:
- .quad system_call
- .quad .Lsysc_do_svc
- .quad .Lsysc_tif
- .quad .Lsysc_restore
- .quad .Lsysc_done
- .quad .Lio_tif
- .quad .Lio_restore
- .quad .Lio_done
- .quad psw_idle
- .quad .Lpsw_idle_end
- .quad save_fpu_regs
- .quad .Lsave_fpu_regs_end
- .quad load_fpu_regs
- .quad .Lload_fpu_regs_end
-
-#if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_table_sie:
- .quad .Lsie_gmap
- .quad .Lsie_done
-
-.Lcleanup_sie:
- cghi %r11,__LC_SAVE_AREA_ASYNC #Is this in normal interrupt?
- je 1f
- slg %r9,BASED(.Lsie_crit_mcck_start)
- clg %r9,BASED(.Lsie_crit_mcck_length)
- jh 1f
- oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1: BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
- lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer
- ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE
- lctlg %c1,%c1,__LC_USER_ASCE # load primary asce
- larl %r9,sie_exit # skip forward to sie_exit
- BR_EX %r14,%r11
-#endif
+ .section .data, "aw"
+ .balign 4
+SYM_DATA_LOCAL(stop_lock, .long 0)
+SYM_DATA_LOCAL(this_cpu, .short 0)
+ .balign 8
+SYM_DATA_START_LOCAL(daton_psw)
+ .quad PSW_KERNEL_BITS
+ .quad .Ldaton
+SYM_DATA_END(daton_psw)
-.Lcleanup_system_call:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn)
- jh 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_SYNC_ENTER_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: # check if stmg has been executed
- clg %r9,BASED(.Lcleanup_system_call_insn+8)
- jh 0f
- mvc __LC_SAVE_AREA_SYNC(64),0(%r11)
-0: # check if base register setup + TIF bit load has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+16)
- jhe 0f
- # set up saved register r12 task struct pointer
- stg %r12,32(%r11)
- # set up saved register r13 __TASK_thread offset
- mvc 40(8,%r11),BASED(.Lcleanup_system_call_const)
-0: # check if the user time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+24)
- jh 0f
- lg %r15,__LC_EXIT_TIMER
- slg %r15,__LC_SYNC_ENTER_TIMER
- alg %r15,__LC_USER_TIMER
- stg %r15,__LC_USER_TIMER
-0: # check if the system time update has been done
- clg %r9,BASED(.Lcleanup_system_call_insn+32)
- jh 0f
- lg %r15,__LC_LAST_UPDATE_TIMER
- slg %r15,__LC_EXIT_TIMER
- alg %r15,__LC_SYSTEM_TIMER
- stg %r15,__LC_SYSTEM_TIMER
-0: # update accounting time stamp
- mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER
- BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
- # set up saved register r11
- lg %r15,__LC_KERNEL_STACK
- la %r9,STACK_FRAME_OVERHEAD(%r15)
- stg %r9,24(%r11) # r11 pt_regs pointer
- # fill pt_regs
- mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC
- stmg %r0,%r7,__PT_R0(%r9)
- mvc __PT_PSW(16,%r9),__LC_SVC_OLD_PSW
- mvc __PT_INT_CODE(4,%r9),__LC_SVC_ILC
- xc __PT_FLAGS(8,%r9),__PT_FLAGS(%r9)
- mvi __PT_FLAGS+7(%r9),_PIF_SYSCALL
- # setup saved register r15
- stg %r15,56(%r11) # r15 stack pointer
- # set new psw address and exit
- larl %r9,.Lsysc_do_svc
- BR_EX %r14,%r11
-.Lcleanup_system_call_insn:
- .quad system_call
- .quad .Lsysc_stmg
- .quad .Lsysc_per
- .quad .Lsysc_vtime+36
- .quad .Lsysc_vtime+42
-.Lcleanup_system_call_const:
- .quad __TASK_thread
-
-.Lcleanup_sysc_tif:
- larl %r9,.Lsysc_tif
- BR_EX %r14,%r11
-
-.Lcleanup_sysc_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_sysc_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_sysc_restore_insn:
- .quad .Lsysc_exit_timer
- .quad .Lsysc_done - 4
-
-.Lcleanup_io_tif:
- larl %r9,.Lio_tif
- BR_EX %r14,%r11
-
-.Lcleanup_io_restore:
- # check if stpt has been executed
- clg %r9,BASED(.Lcleanup_io_restore_insn)
- jh 0f
- mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
-0: clg %r9,BASED(.Lcleanup_io_restore_insn+8)
- je 1f
- lg %r9,24(%r11) # get saved r11 pointer to pt_regs
- mvc __LC_RETURN_PSW(16),__PT_PSW(%r9)
- mvc 0(64,%r11),__PT_R8(%r9)
- lmg %r0,%r7,__PT_R0(%r9)
-1: lmg %r8,%r9,__LC_RETURN_PSW
- BR_EX %r14,%r11
-.Lcleanup_io_restore_insn:
- .quad .Lio_exit_timer
- .quad .Lio_done - 4
-
-.Lcleanup_idle:
- ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT
- # copy interrupt clock & cpu timer
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_INT_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_ASYNC_ENTER_TIMER
- cghi %r11,__LC_SAVE_AREA_ASYNC
- je 0f
- mvc __CLOCK_IDLE_EXIT(8,%r2),__LC_MCCK_CLOCK
- mvc __TIMER_IDLE_EXIT(8,%r2),__LC_MCCK_ENTER_TIMER
-0: # check if stck & stpt have been executed
- clg %r9,BASED(.Lcleanup_idle_insn)
- jhe 1f
- mvc __CLOCK_IDLE_ENTER(8,%r2),__CLOCK_IDLE_EXIT(%r2)
- mvc __TIMER_IDLE_ENTER(8,%r2),__TIMER_IDLE_EXIT(%r2)
-1: # calculate idle cycles
- clg %r9,BASED(.Lcleanup_idle_insn)
- jl 3f
- larl %r1,smp_cpu_mtid
- llgf %r1,0(%r1)
- ltgr %r1,%r1
- jz 3f
- .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15)
- larl %r3,mt_cycles
- ag %r3,__LC_PERCPU_OFFSET
- la %r4,__SF_EMPTY+16(%r15)
-2: lg %r0,0(%r3)
- slg %r0,0(%r4)
- alg %r0,64(%r4)
- stg %r0,0(%r3)
- la %r3,8(%r3)
- la %r4,8(%r4)
- brct %r1,2b
-3: # account system time going idle
- lg %r9,__LC_STEAL_TIMER
- alg %r9,__CLOCK_IDLE_ENTER(%r2)
- slg %r9,__LC_LAST_UPDATE_CLOCK
- stg %r9,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2)
- lg %r9,__LC_SYSTEM_TIMER
- alg %r9,__LC_LAST_UPDATE_TIMER
- slg %r9,__TIMER_IDLE_ENTER(%r2)
- stg %r9,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2)
- # prepare return psw
- nihh %r8,0xfcfd # clear irq & wait state bits
- lg %r9,48(%r11) # return from psw_idle
- BR_EX %r14,%r11
-.Lcleanup_idle_insn:
- .quad .Lpsw_idle_lpsw
-
-.Lcleanup_save_fpu_regs:
- larl %r9,save_fpu_regs
- BR_EX %r14,%r11
-
-.Lcleanup_load_fpu_regs:
- larl %r9,load_fpu_regs
- BR_EX %r14,%r11
-
-/*
- * Integer constants
- */
- .align 8
-.Lcritical_start:
- .quad .L__critical_start
-.Lcritical_length:
- .quad .L__critical_end - .L__critical_start
-#if IS_ENABLED(CONFIG_KVM)
-.Lsie_critical_start:
- .quad .Lsie_gmap
-.Lsie_critical_length:
- .quad .Lsie_done - .Lsie_gmap
-.Lsie_crit_mcck_start:
- .quad .Lsie_entry
-.Lsie_crit_mcck_length:
- .quad .Lsie_skip - .Lsie_entry
-#endif
.section .rodata, "a"
#define SYSCALL(esame,emu) .quad __s390x_ ## esame
- .globl sys_call_table
-sys_call_table:
+SYM_DATA_START(sys_call_table)
#include "asm/syscall_table.h"
+SYM_DATA_END(sys_call_table)
#undef SYSCALL
#ifdef CONFIG_COMPAT
#define SYSCALL(esame,emu) .quad __s390_ ## emu
- .globl sys_call_table_emu
-sys_call_table_emu:
+SYM_DATA_START(sys_call_table_emu)
#include "asm/syscall_table.h"
+SYM_DATA_END(sys_call_table_emu)
#undef SYSCALL
#endif
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index b2956d49b6ad..9f41853f36b9 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -5,11 +5,11 @@
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/signal.h>
+#include <asm/extable.h>
#include <asm/ptrace.h>
#include <asm/idle.h>
extern void *restart_stack;
-extern unsigned long suspend_zero_pages;
void system_call(void);
void pgm_check_handler(void);
@@ -17,53 +17,29 @@ void ext_int_handler(void);
void io_int_handler(void);
void mcck_int_handler(void);
void restart_int_handler(void);
-void restart_call_handler(void);
+void early_pgm_check_handler(void);
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs);
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs);
+void __do_pgm_check(struct pt_regs *regs);
+void __do_syscall(struct pt_regs *regs, int per_trap);
+void __do_early_pgm_check(struct pt_regs *regs);
void do_protection_exception(struct pt_regs *regs);
void do_dat_exception(struct pt_regs *regs);
-
-void addressing_exception(struct pt_regs *regs);
-void data_exception(struct pt_regs *regs);
-void default_trap_handler(struct pt_regs *regs);
-void divide_exception(struct pt_regs *regs);
-void execute_exception(struct pt_regs *regs);
-void hfp_divide_exception(struct pt_regs *regs);
-void hfp_overflow_exception(struct pt_regs *regs);
-void hfp_significance_exception(struct pt_regs *regs);
-void hfp_sqrt_exception(struct pt_regs *regs);
-void hfp_underflow_exception(struct pt_regs *regs);
-void illegal_op(struct pt_regs *regs);
-void operand_exception(struct pt_regs *regs);
-void overflow_exception(struct pt_regs *regs);
-void privileged_op(struct pt_regs *regs);
-void space_switch_exception(struct pt_regs *regs);
-void special_op_exception(struct pt_regs *regs);
-void specification_exception(struct pt_regs *regs);
-void transaction_exception(struct pt_regs *regs);
-void translation_exception(struct pt_regs *regs);
-void vector_exception(struct pt_regs *regs);
-
-void do_per_trap(struct pt_regs *regs);
+void do_secure_storage_access(struct pt_regs *regs);
+void do_non_secure_storage_access(struct pt_regs *regs);
+void do_secure_storage_violation(struct pt_regs *regs);
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str);
-void syscall_trace(struct pt_regs *regs, int entryexit);
void kernel_stack_overflow(struct pt_regs * regs);
-void do_signal(struct pt_regs *regs);
void handle_signal32(struct ksignal *ksig, sigset_t *oldset,
struct pt_regs *regs);
-void do_notify_resume(struct pt_regs *regs);
-void __init init_IRQ(void);
-void do_IRQ(struct pt_regs *regs, int irq);
-void do_restart(void);
-void __init startup_init_nobss(void);
+void do_io_irq(struct pt_regs *regs);
+void do_ext_irq(struct pt_regs *regs);
+void do_restart(void *arg);
void __init startup_init(void);
void die(struct pt_regs *regs, const char *str);
int setup_profiling_timer(unsigned int multiplier);
-void __init time_init(void);
-void s390_early_resume(void);
unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip);
struct s390_mmap_arg_struct;
@@ -82,10 +58,18 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user
DECLARE_PER_CPU(u64, mt_cycles[8]);
-void gs_load_bc_cb(struct pt_regs *regs);
-void set_fs_fixup(void);
-
unsigned long stack_alloc(void);
void stack_free(unsigned long stack);
+extern char kprobes_insn_page[];
+
+extern char _samode31[], _eamode31[];
+extern char _stext_amode31[], _etext_amode31[];
+extern struct exception_table_entry _start_amode31_ex_table[];
+extern struct exception_table_entry _stop_amode31_ex_table[];
+
+#define __amode31_data __section(".amode31.data")
+#define __amode31_ref __section(".amode31.refs")
+extern long _start_amode31_refs[], _end_amode31_refs[];
+
#endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c
new file mode 100644
index 000000000000..f02127219a27
--- /dev/null
+++ b/arch/s390/kernel/facility.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2023
+ */
+
+#include <asm/facility.h>
+
+unsigned int stfle_size(void)
+{
+ static unsigned int size;
+ unsigned int r;
+ u64 dummy;
+
+ r = READ_ONCE(size);
+ if (!r) {
+ r = __stfle_asm(&dummy, 1) + 1;
+ WRITE_ONCE(size, r);
+ }
+ return r;
+}
+EXPORT_SYMBOL(stfle_size);
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index 0da378e2eb25..a4f3449cc814 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -10,8 +10,7 @@
#include <linux/sched.h>
#include <asm/fpu/types.h>
#include <asm/fpu/api.h>
-
-asm(".include \"asm/vx-insn.h\"\n");
+#include <asm/vx-insn.h>
void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
{
@@ -25,7 +24,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
/* Save floating point control */
asm volatile("stfpc %0" : "=Q" (state->fpc));
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
if (flags & KERNEL_VXR_V0V7) {
/* Save floating-point registers */
asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
@@ -107,7 +106,7 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
/* Restore floating-point controls */
asm volatile("lfpc %0" : : "Q" (state->fpc));
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
if (flags & KERNEL_VXR_V0V7) {
/* Restore floating-point registers */
asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
@@ -175,3 +174,90 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
: "1", "cc");
}
EXPORT_SYMBOL(__kernel_fpu_end);
+
+void __load_fpu_regs(void)
+{
+ unsigned long *regs = current->thread.fpu.regs;
+ struct fpu *state = &current->thread.fpu;
+
+ sfpc_safe(state->fpc);
+ if (likely(cpu_has_vx())) {
+ asm volatile("lgr 1,%0\n"
+ "VLM 0,15,0,1\n"
+ "VLM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("ld 0,%0" : : "Q" (regs[0]));
+ asm volatile("ld 1,%0" : : "Q" (regs[1]));
+ asm volatile("ld 2,%0" : : "Q" (regs[2]));
+ asm volatile("ld 3,%0" : : "Q" (regs[3]));
+ asm volatile("ld 4,%0" : : "Q" (regs[4]));
+ asm volatile("ld 5,%0" : : "Q" (regs[5]));
+ asm volatile("ld 6,%0" : : "Q" (regs[6]));
+ asm volatile("ld 7,%0" : : "Q" (regs[7]));
+ asm volatile("ld 8,%0" : : "Q" (regs[8]));
+ asm volatile("ld 9,%0" : : "Q" (regs[9]));
+ asm volatile("ld 10,%0" : : "Q" (regs[10]));
+ asm volatile("ld 11,%0" : : "Q" (regs[11]));
+ asm volatile("ld 12,%0" : : "Q" (regs[12]));
+ asm volatile("ld 13,%0" : : "Q" (regs[13]));
+ asm volatile("ld 14,%0" : : "Q" (regs[14]));
+ asm volatile("ld 15,%0" : : "Q" (regs[15]));
+ }
+ clear_cpu_flag(CIF_FPU);
+}
+
+void load_fpu_regs(void)
+{
+ raw_local_irq_disable();
+ __load_fpu_regs();
+ raw_local_irq_enable();
+}
+EXPORT_SYMBOL(load_fpu_regs);
+
+void save_fpu_regs(void)
+{
+ unsigned long flags, *regs;
+ struct fpu *state;
+
+ local_irq_save(flags);
+
+ if (test_cpu_flag(CIF_FPU))
+ goto out;
+
+ state = &current->thread.fpu;
+ regs = current->thread.fpu.regs;
+
+ asm volatile("stfpc %0" : "=Q" (state->fpc));
+ if (likely(cpu_has_vx())) {
+ asm volatile("lgr 1,%0\n"
+ "VSTM 0,15,0,1\n"
+ "VSTM 16,31,256,1\n"
+ :
+ : "d" (regs)
+ : "1", "cc", "memory");
+ } else {
+ asm volatile("std 0,%0" : "=Q" (regs[0]));
+ asm volatile("std 1,%0" : "=Q" (regs[1]));
+ asm volatile("std 2,%0" : "=Q" (regs[2]));
+ asm volatile("std 3,%0" : "=Q" (regs[3]));
+ asm volatile("std 4,%0" : "=Q" (regs[4]));
+ asm volatile("std 5,%0" : "=Q" (regs[5]));
+ asm volatile("std 6,%0" : "=Q" (regs[6]));
+ asm volatile("std 7,%0" : "=Q" (regs[7]));
+ asm volatile("std 8,%0" : "=Q" (regs[8]));
+ asm volatile("std 9,%0" : "=Q" (regs[9]));
+ asm volatile("std 10,%0" : "=Q" (regs[10]));
+ asm volatile("std 11,%0" : "=Q" (regs[11]));
+ asm volatile("std 12,%0" : "=Q" (regs[12]));
+ asm volatile("std 13,%0" : "=Q" (regs[13]));
+ asm volatile("std 14,%0" : "=Q" (regs[14]));
+ asm volatile("std 15,%0" : "=Q" (regs[15]));
+ }
+ set_cpu_flag(CIF_FPU);
+out:
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(save_fpu_regs);
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 1bb85f60c0dd..c46381ea04ec 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -4,8 +4,7 @@
*
* Copyright IBM Corp. 2009,2014
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
#include <linux/moduleloader.h>
@@ -17,179 +16,217 @@
#include <linux/kprobes.h>
#include <trace/syscall.h>
#include <asm/asm-offsets.h>
+#include <asm/text-patching.h>
#include <asm/cacheflush.h>
+#include <asm/ftrace.lds.h>
+#include <asm/nospec-branch.h>
#include <asm/set_memory.h>
#include "entry.h"
+#include "ftrace.h"
/*
- * The mcount code looks like this:
- * stg %r14,8(%r15) # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * Total length is 24 bytes. Only the first instruction will be patched
- * by ftrace_make_call / ftrace_make_nop.
- * The enabled ftrace code block looks like this:
+ * To generate function prologue either gcc's hotpatch feature (since gcc 4.8)
+ * or a combination of -pg -mrecord-mcount -mnop-mcount -mfentry flags
+ * (since gcc 9 / clang 10) is used.
+ * In both cases the original and also the disabled function prologue contains
+ * only a single six byte instruction and looks like this:
+ * > brcl 0,0 # offset 0
+ * To enable ftrace the code gets patched like above and afterwards looks
+ * like this:
* > brasl %r0,ftrace_caller # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
+ *
+ * The instruction will be patched by ftrace_make_call / ftrace_make_nop.
* The ftrace function gets called with a non-standard C function call ABI
* where r0 contains the return address. It is also expected that the called
* function only clobbers r0 and r1, but restores r2-r15.
* For module code we can't directly jump to ftrace caller, but need a
* trampoline (ftrace_plt), which clobbers also r1.
- * The return point of the ftrace function has offset 24, so execution
- * continues behind the mcount block.
- * The disabled ftrace code block looks like this:
- * > jg .+24 # offset 0
- * larl %r1,<&counter> # offset 6
- * brasl %r14,_mcount # offset 12
- * lg %r14,8(%r15) # offset 18
- * The jg instruction branches to offset 24 to skip as many instructions
- * as possible.
- * In case we use gcc's hotpatch feature the original and also the disabled
- * function prologue contains only a single six byte instruction and looks
- * like this:
- * > brcl 0,0 # offset 0
- * To enable ftrace the code gets patched like above and afterwards looks
- * like this:
- * > brasl %r0,ftrace_caller # offset 0
*/
-unsigned long ftrace_plt;
+void *ftrace_func __read_mostly = ftrace_stub;
+struct ftrace_insn {
+ u16 opc;
+ s32 disp;
+} __packed;
-static inline void ftrace_generate_orig_insn(struct ftrace_insn *insn)
+#ifdef CONFIG_MODULES
+static char *ftrace_plt;
+#endif /* CONFIG_MODULES */
+
+static const char *ftrace_shared_hotpatch_trampoline(const char **end)
{
-#if defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT)
- /* brcl 0,0 */
- insn->opc = 0xc004;
- insn->disp = 0;
-#else
- /* stg r14,8(r15) */
- insn->opc = 0xe3e0;
- insn->disp = 0xf0080024;
-#endif
+ const char *tstart, *tend;
+
+ tstart = ftrace_shared_hotpatch_trampoline_br;
+ tend = ftrace_shared_hotpatch_trampoline_br_end;
+#ifdef CONFIG_EXPOLINE
+ if (!nospec_disable) {
+ tstart = ftrace_shared_hotpatch_trampoline_exrl;
+ tend = ftrace_shared_hotpatch_trampoline_exrl_end;
+ }
+#endif /* CONFIG_EXPOLINE */
+ if (end)
+ *end = tend;
+ return tstart;
}
-static inline int is_kprobe_on_ftrace(struct ftrace_insn *insn)
+bool ftrace_need_init_nop(void)
{
-#ifdef CONFIG_KPROBES
- if (insn->opc == BREAKPOINT_INSTRUCTION)
- return 1;
-#endif
- return 0;
+ return true;
}
-static inline void ftrace_generate_kprobe_nop_insn(struct ftrace_insn *insn)
+int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_NOP;
+ static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline =
+ __ftrace_hotpatch_trampolines_start;
+ static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
+ static struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_hotpatch_trampoline **next_trampoline;
+ struct ftrace_hotpatch_trampoline *trampolines_end;
+ struct ftrace_hotpatch_trampoline tmp;
+ struct ftrace_insn *insn;
+ const char *shared;
+ s32 disp;
+
+ BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) !=
+ SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE);
+
+ next_trampoline = &next_vmlinux_trampoline;
+ trampolines_end = __ftrace_hotpatch_trampolines_end;
+ shared = ftrace_shared_hotpatch_trampoline(NULL);
+#ifdef CONFIG_MODULES
+ if (mod) {
+ next_trampoline = &mod->arch.next_trampoline;
+ trampolines_end = mod->arch.trampolines_end;
+ shared = ftrace_plt;
+ }
#endif
+
+ if (WARN_ON_ONCE(*next_trampoline >= trampolines_end))
+ return -ENOMEM;
+ trampoline = (*next_trampoline)++;
+
+ /* Check for the compiler-generated fentry nop (brcl 0, .). */
+ if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig))))
+ return -EINVAL;
+
+ /* Generate the trampoline. */
+ tmp.brasl_opc = 0xc015; /* brasl %r1, shared */
+ tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2;
+ tmp.interceptor = FTRACE_ADDR;
+ tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn);
+ s390_kernel_write(trampoline, &tmp, sizeof(tmp));
+
+ /* Generate a jump to the trampoline. */
+ disp = ((char *)trampoline - (char *)rec->ip) / 2;
+ insn = (struct ftrace_insn *)rec->ip;
+ s390_kernel_write(&insn->disp, &disp, sizeof(disp));
+
+ return 0;
}
-static inline void ftrace_generate_kprobe_call_insn(struct ftrace_insn *insn)
+static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec)
{
-#ifdef CONFIG_KPROBES
- insn->opc = BREAKPOINT_INSTRUCTION;
- insn->disp = KPROBE_ON_FTRACE_CALL;
-#endif
+ struct ftrace_hotpatch_trampoline *trampoline;
+ struct ftrace_insn insn;
+ s64 disp;
+ u16 opc;
+
+ if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn)))
+ return ERR_PTR(-EFAULT);
+ disp = (s64)insn.disp * 2;
+ trampoline = (void *)(rec->ip + disp);
+ if (get_kernel_nofault(opc, &trampoline->brasl_opc))
+ return ERR_PTR(-EFAULT);
+ if (opc != 0xc015)
+ return ERR_PTR(-EINVAL);
+ return trampoline;
}
int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
+ struct ftrace_hotpatch_trampoline *trampoline;
+ u64 old;
+
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ if (get_kernel_nofault(old, &trampoline->interceptor))
+ return -EFAULT;
+ if (old != old_addr)
+ return -EINVAL;
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
return 0;
}
-int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
- unsigned long addr)
+static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable)
{
- struct ftrace_insn orig, new, old;
+ u16 old;
+ u8 op;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
+ if (get_kernel_nofault(old, addr))
return -EFAULT;
- if (addr == MCOUNT_ADDR) {
- /* Initial code replacement */
- ftrace_generate_orig_insn(&orig);
- ftrace_generate_nop_insn(&new);
- } else if (is_kprobe_on_ftrace(&old)) {
- /*
- * If we find a breakpoint instruction, a kprobe has been
- * placed at the beginning of the function. We write the
- * constant KPROBE_ON_FTRACE_NOP into the remaining four
- * bytes of the original instruction so that the kprobes
- * handler can execute a nop, if it reaches this breakpoint.
- */
- ftrace_generate_kprobe_call_insn(&orig);
- ftrace_generate_kprobe_nop_insn(&new);
- } else {
- /* Replace ftrace call with a nop. */
- ftrace_generate_call_insn(&orig, rec->ip);
- ftrace_generate_nop_insn(&new);
- }
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
+ if (old != expected)
return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+ /* set mask field to all ones or zeroes */
+ op = enable ? 0xf4 : 0x04;
+ s390_kernel_write((char *)addr + 1, &op, sizeof(op));
return 0;
}
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
+ unsigned long addr)
+{
+ /* Expect brcl 0xf,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false);
+}
+
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- struct ftrace_insn orig, new, old;
+ struct ftrace_hotpatch_trampoline *trampoline;
- if (probe_kernel_read(&old, (void *) rec->ip, sizeof(old)))
- return -EFAULT;
- if (is_kprobe_on_ftrace(&old)) {
- /*
- * If we find a breakpoint instruction, a kprobe has been
- * placed at the beginning of the function. We write the
- * constant KPROBE_ON_FTRACE_CALL into the remaining four
- * bytes of the original instruction so that the kprobes
- * handler can execute a brasl if it reaches this breakpoint.
- */
- ftrace_generate_kprobe_nop_insn(&orig);
- ftrace_generate_kprobe_call_insn(&new);
- } else {
- /* Replace nop with an ftrace call. */
- ftrace_generate_nop_insn(&orig);
- ftrace_generate_call_insn(&new, rec->ip);
- }
- /* Verify that the to be replaced code matches what we expect. */
- if (memcmp(&orig, &old, sizeof(old)))
- return -EINVAL;
- s390_kernel_write((void *) rec->ip, &new, sizeof(new));
- return 0;
+ trampoline = ftrace_get_trampoline(rec);
+ if (IS_ERR(trampoline))
+ return PTR_ERR(trampoline);
+ s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr));
+ /* Expect brcl 0x0,... */
+ return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true);
}
int ftrace_update_ftrace_func(ftrace_func_t func)
{
+ ftrace_func = func;
return 0;
}
-int __init ftrace_dyn_arch_init(void)
+void arch_ftrace_update_code(int command)
{
- return 0;
+ ftrace_modify_all_code(command);
+}
+
+void ftrace_arch_code_modify_post_process(void)
+{
+ /*
+ * Flush any pre-fetched instructions on all
+ * CPUs to make the new code visible.
+ */
+ text_poke_sync_lock();
}
#ifdef CONFIG_MODULES
static int __init ftrace_plt_init(void)
{
- unsigned int *ip;
+ const char *start, *end;
- ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
+ ftrace_plt = module_alloc(PAGE_SIZE);
if (!ftrace_plt)
panic("cannot allocate ftrace plt\n");
- ip = (unsigned int *) ftrace_plt;
- ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
- ip[1] = 0x100a0004;
- ip[2] = 0x07f10000;
- ip[3] = FTRACE_ADDR >> 32;
- ip[4] = FTRACE_ADDR & 0xffffffff;
- set_memory_ro(ftrace_plt, 1);
+
+ start = ftrace_shared_hotpatch_trampoline(&end);
+ memcpy(ftrace_plt, start, end - start);
+ set_memory_rox((unsigned long)ftrace_plt, 1);
return 0;
}
device_initcall(ftrace_plt_init);
@@ -226,18 +263,78 @@ NOKPROBE_SYMBOL(prepare_ftrace_return);
*/
int ftrace_enable_ftrace_graph_caller(void)
{
- u8 op = 0x04; /* set mask field to zero */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0xf,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
int ftrace_disable_ftrace_graph_caller(void)
{
- u8 op = 0xf4; /* set mask field to all ones */
+ int rc;
- s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op));
+ /* Expect brc 0x0,... */
+ rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true);
+ if (rc)
+ return rc;
+ text_poke_sync_lock();
return 0;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct kprobe_ctlblk *kcb;
+ struct pt_regs *regs;
+ struct kprobe *p;
+ int bit;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ regs = ftrace_get_regs(fregs);
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (!regs || unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ goto out;
+ }
+
+ __this_cpu_write(current_kprobe, p);
+
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ instruction_pointer_set(regs, ip);
+
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
+
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ }
+ __this_cpu_write(current_kprobe, NULL);
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ return 0;
+}
+#endif
diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h
new file mode 100644
index 000000000000..7f75a9616406
--- /dev/null
+++ b/arch/s390/kernel/ftrace.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FTRACE_H
+#define _FTRACE_H
+
+#include <asm/types.h>
+
+struct ftrace_hotpatch_trampoline {
+ u16 brasl_opc;
+ s32 brasl_disp;
+ s16: 16;
+ u64 rest_of_intercepted_function;
+ u64 interceptor;
+} __packed;
+
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[];
+extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[];
+extern const char ftrace_shared_hotpatch_trampoline_br[];
+extern const char ftrace_shared_hotpatch_trampoline_br_end[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl[];
+extern const char ftrace_shared_hotpatch_trampoline_exrl_end[];
+extern const char ftrace_plt_template[];
+extern const char ftrace_plt_template_end[];
+
+#endif /* _FTRACE_H */
diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c
index d14dd1c2e524..0b68168d9566 100644
--- a/arch/s390/kernel/guarded_storage.c
+++ b/arch/s390/kernel/guarded_storage.c
@@ -28,7 +28,7 @@ static int gs_enable(void)
return -ENOMEM;
gs_cb->gsd = 25;
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
load_gs_cb(gs_cb);
current->thread.gs_cb = gs_cb;
preempt_enable();
@@ -42,7 +42,7 @@ static int gs_disable(void)
preempt_disable();
kfree(current->thread.gs_cb);
current->thread.gs_cb = NULL;
- __ctl_clear_bit(2, 4);
+ local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT);
preempt_enable();
}
return 0;
@@ -84,7 +84,7 @@ void gs_load_bc_cb(struct pt_regs *regs)
if (gs_cb) {
kfree(current->thread.gs_cb);
current->thread.gs_bc_cb = NULL;
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
load_gs_cb(gs_cb);
current->thread.gs_cb = gs_cb;
}
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 8b88dbbda7df..45413b04efc5 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -5,7 +5,6 @@
* Author(s): Hartmut Penner <hp@de.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* Rob van der Heij <rvdhei@iae.nl>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
*
*/
@@ -17,32 +16,25 @@
#include <asm/ptrace.h>
__HEAD
-ENTRY(startup_continue)
- tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ?
- jz 0f
- xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid
- mvi __LC_LPP,0x80 # and set LPP_MAGIC
- .insn s,0xb2800000,__LC_LPP # load program parameter
-0: larl %r1,tod_clock_base
+SYM_CODE_START(startup_continue)
+ larl %r1,tod_clock_base
mvc 0(16,%r1),__LC_BOOT_CLOCK
- larl %r13,.LPG1 # get base
#
# Setup stack
#
larl %r14,init_task
stg %r14,__LC_CURRENT
- larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE
-#ifdef CONFIG_KASAN
- brasl %r14,kasan_early_init
-#endif
+ larl %r15,init_thread_union+STACK_INIT_OFFSET
+ stg %r15,__LC_KERNEL_STACK
+ brasl %r14,sclp_early_adjust_va # allow sclp_early_printk
brasl %r14,startup_init # s390 specific early init
brasl %r14,start_kernel # common init code
#
# We returned from start_kernel ?!? PANIK
#
basr %r13,0
- lpswe .Ldw-.(%r13) # load disabled wait psw
+ lpswe dw_psw-.(%r13) # load disabled wait psw
+SYM_CODE_END(startup_continue)
- .align 16
-.LPG1:
-.Ldw: .quad 0x0002000180000000,0x0000000000000000
+ .balign 16
+SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000)
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 8f8456816d83..e7239aaf428b 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -9,134 +9,86 @@
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
-#include <linux/kprobes.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/sched/cputime.h>
+#include <trace/events/power.h>
+#include <asm/cpu_mf.h>
+#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include "entry.h"
static DEFINE_PER_CPU(struct s390_idle_data, s390_idle);
-void enabled_wait(void)
+void account_idle_time_irq(void)
{
struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
- unsigned long long idle_time;
- unsigned long psw_mask;
+ unsigned long idle_time;
+ u64 cycles_new[8];
+ int i;
+
+ if (smp_cpu_mtid) {
+ stcctm(MT_DIAG, smp_cpu_mtid, cycles_new);
+ for (i = 0; i < smp_cpu_mtid; i++)
+ this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]);
+ }
- trace_hardirqs_on();
+ idle_time = S390_lowcore.int_clock - idle->clock_idle_enter;
- /* Wait for external, I/O or machine check interrupt. */
- psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
- clear_cpu_flag(CIF_NOHZ_DELAY);
+ S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock;
+ S390_lowcore.last_update_clock = S390_lowcore.int_clock;
- /* Call the assembler magic in entry.S */
- psw_idle(idle, psw_mask);
-
- trace_hardirqs_off();
+ S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter;
+ S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer;
/* Account time spent with enabled wait psw loaded as idle time. */
- write_seqcount_begin(&idle->seqcount);
- idle_time = idle->clock_idle_exit - idle->clock_idle_enter;
- idle->clock_idle_enter = idle->clock_idle_exit = 0ULL;
- idle->idle_time += idle_time;
- idle->idle_count++;
+ WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time);
+ WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1);
account_idle_time(cputime_to_nsecs(idle_time));
- write_seqcount_end(&idle->seqcount);
}
-NOKPROBE_SYMBOL(enabled_wait);
+
+void noinstr arch_cpu_idle(void)
+{
+ struct s390_idle_data *idle = this_cpu_ptr(&s390_idle);
+ unsigned long psw_mask;
+
+ /* Wait for external, I/O or machine check interrupt. */
+ psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT |
+ PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
+ clear_cpu_flag(CIF_NOHZ_DELAY);
+
+ /* psw_idle() returns with interrupts disabled. */
+ psw_idle(idle, psw_mask);
+}
static ssize_t show_idle_count(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
- unsigned long long idle_count;
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_count = READ_ONCE(idle->idle_count);
- if (READ_ONCE(idle->clock_idle_enter))
- idle_count++;
- } while (read_seqcount_retry(&idle->seqcount, seq));
- return sprintf(buf, "%llu\n", idle_count);
+
+ return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count));
}
DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL);
static ssize_t show_idle_time(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
- unsigned long long now, idle_time, idle_enter, idle_exit, in_idle;
struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id);
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_time = READ_ONCE(idle->idle_time);
- idle_enter = READ_ONCE(idle->clock_idle_enter);
- idle_exit = READ_ONCE(idle->clock_idle_exit);
- } while (read_seqcount_retry(&idle->seqcount, seq));
- in_idle = 0;
- now = get_tod_clock();
- if (idle_enter) {
- if (idle_exit) {
- in_idle = idle_exit - idle_enter;
- } else if (now > idle_enter) {
- in_idle = now - idle_enter;
- }
- }
- idle_time += in_idle;
- return sprintf(buf, "%llu\n", idle_time >> 12);
-}
-DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
-u64 arch_cpu_idle_time(int cpu)
-{
- struct s390_idle_data *idle = &per_cpu(s390_idle, cpu);
- unsigned long long now, idle_enter, idle_exit, in_idle;
- unsigned int seq;
-
- do {
- seq = read_seqcount_begin(&idle->seqcount);
- idle_enter = READ_ONCE(idle->clock_idle_enter);
- idle_exit = READ_ONCE(idle->clock_idle_exit);
- } while (read_seqcount_retry(&idle->seqcount, seq));
- in_idle = 0;
- now = get_tod_clock();
- if (idle_enter) {
- if (idle_exit) {
- in_idle = idle_exit - idle_enter;
- } else if (now > idle_enter) {
- in_idle = now - idle_enter;
- }
- }
- return cputime_to_nsecs(in_idle);
+ return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12);
}
+DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL);
void arch_cpu_idle_enter(void)
{
- local_mcck_disable();
-}
-
-void arch_cpu_idle(void)
-{
- if (!test_cpu_flag(CIF_MCCK_PENDING))
- /* Halt the cpu and keep track of cpu time accounting. */
- enabled_wait();
- local_irq_enable();
}
void arch_cpu_idle_exit(void)
{
- local_mcck_enable();
- if (test_cpu_flag(CIF_MCCK_PENDING))
- s390_handle_mcck();
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
cpu_die();
}
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 6837affc19e8..ba75f6bee774 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -4,7 +4,6 @@
*
* Copyright IBM Corp. 2005, 2012
* Author(s): Michael Holzheu <holzheu@de.ibm.com>
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Volker Sameske <sameske@de.ibm.com>
*/
@@ -13,12 +12,15 @@
#include <linux/init.h>
#include <linux/device.h>
#include <linux/delay.h>
+#include <linux/kstrtox.h>
+#include <linux/panic_notifier.h>
#include <linux/reboot.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/crash_dump.h>
#include <linux/debug_locks.h>
+#include <asm/asm-extable.h>
#include <asm/diag.h>
#include <asm/ipl.h>
#include <asm/smp.h>
@@ -28,6 +30,7 @@
#include <asm/sclp.h>
#include <asm/checksum.h>
#include <asm/debug.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
@@ -37,12 +40,18 @@
#define IPL_UNKNOWN_STR "unknown"
#define IPL_CCW_STR "ccw"
+#define IPL_ECKD_STR "eckd"
+#define IPL_ECKD_DUMP_STR "eckd_dump"
#define IPL_FCP_STR "fcp"
#define IPL_FCP_DUMP_STR "fcp_dump"
+#define IPL_NVME_STR "nvme"
+#define IPL_NVME_DUMP_STR "nvme_dump"
#define IPL_NSS_STR "nss"
#define DUMP_CCW_STR "ccw"
+#define DUMP_ECKD_STR "eckd"
#define DUMP_FCP_STR "fcp"
+#define DUMP_NVME_STR "nvme"
#define DUMP_NONE_STR "none"
/*
@@ -87,12 +96,20 @@ static char *ipl_type_str(enum ipl_type type)
switch (type) {
case IPL_TYPE_CCW:
return IPL_CCW_STR;
+ case IPL_TYPE_ECKD:
+ return IPL_ECKD_STR;
+ case IPL_TYPE_ECKD_DUMP:
+ return IPL_ECKD_DUMP_STR;
case IPL_TYPE_FCP:
return IPL_FCP_STR;
case IPL_TYPE_FCP_DUMP:
return IPL_FCP_DUMP_STR;
case IPL_TYPE_NSS:
return IPL_NSS_STR;
+ case IPL_TYPE_NVME:
+ return IPL_NVME_STR;
+ case IPL_TYPE_NVME_DUMP:
+ return IPL_NVME_DUMP_STR;
case IPL_TYPE_UNKNOWN:
default:
return IPL_UNKNOWN_STR;
@@ -103,6 +120,8 @@ enum dump_type {
DUMP_TYPE_NONE = 1,
DUMP_TYPE_CCW = 2,
DUMP_TYPE_FCP = 4,
+ DUMP_TYPE_NVME = 8,
+ DUMP_TYPE_ECKD = 16,
};
static char *dump_type_str(enum dump_type type)
@@ -112,8 +131,12 @@ static char *dump_type_str(enum dump_type type)
return DUMP_NONE_STR;
case DUMP_TYPE_CCW:
return DUMP_CCW_STR;
+ case DUMP_TYPE_ECKD:
+ return DUMP_ECKD_STR;
case DUMP_TYPE_FCP:
return DUMP_FCP_STR;
+ case DUMP_TYPE_NVME:
+ return DUMP_NVME_STR;
default:
return NULL;
}
@@ -133,37 +156,48 @@ static int reipl_capabilities = IPL_TYPE_UNKNOWN;
static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN;
static struct ipl_parameter_block *reipl_block_fcp;
+static struct ipl_parameter_block *reipl_block_nvme;
static struct ipl_parameter_block *reipl_block_ccw;
+static struct ipl_parameter_block *reipl_block_eckd;
static struct ipl_parameter_block *reipl_block_nss;
static struct ipl_parameter_block *reipl_block_actual;
static int dump_capabilities = DUMP_TYPE_NONE;
static enum dump_type dump_type = DUMP_TYPE_NONE;
static struct ipl_parameter_block *dump_block_fcp;
+static struct ipl_parameter_block *dump_block_nvme;
static struct ipl_parameter_block *dump_block_ccw;
+static struct ipl_parameter_block *dump_block_eckd;
static struct sclp_ipl_info sclp_ipl_info;
-static inline int __diag308(unsigned long subcode, void *addr)
+static bool reipl_nvme_clear;
+static bool reipl_fcp_clear;
+static bool reipl_ccw_clear;
+static bool reipl_eckd_clear;
+
+static unsigned long os_info_flags;
+
+static inline int __diag308(unsigned long subcode, unsigned long addr)
{
- register unsigned long _addr asm("0") = (unsigned long) addr;
- register unsigned long _rc asm("1") = 0;
+ union register_pair r1;
+ r1.even = addr;
+ r1.odd = 0;
asm volatile(
- " diag %0,%2,0x308\n"
+ " diag %[r1],%[subcode],0x308\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : "+d" (_addr), "+d" (_rc)
- : "d" (subcode) : "cc", "memory");
- return _rc;
+ : [r1] "+&d" (r1.pair)
+ : [subcode] "d" (subcode)
+ : "cc", "memory");
+ return r1.odd;
}
int diag308(unsigned long subcode, void *addr)
{
- if (IS_ENABLED(CONFIG_KASAN))
- __arch_local_irq_stosm(0x04); /* enable DAT */
diag_stat_inc(DIAG_STAT_X308);
- return __diag308(subcode, addr);
+ return __diag308(subcode, addr ? virt_to_phys(addr) : 0);
}
EXPORT_SYMBOL_GPL(diag308);
@@ -174,7 +208,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
- return snprintf(page, PAGE_SIZE, _format, ##args); \
+ return scnprintf(page, PAGE_SIZE, _format, ##args); \
}
#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \
@@ -200,14 +234,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \
_ipl_blk.ssid, _ipl_blk.devno); \
IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name, (S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store) \
#define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \
IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL)
+ __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL)
#define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \
IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \
@@ -222,7 +256,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \
return len; \
} \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name,(S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store)
@@ -232,12 +266,12 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \
struct kobj_attribute *attr, \
const char *buf, size_t len) \
{ \
- strncpy(_value, buf, sizeof(_value) - 1); \
+ strscpy(_value, buf, sizeof(_value)); \
strim(_value); \
return len; \
} \
static struct kobj_attribute sys_##_prefix##_##_name##_attr = \
- __ATTR(_name,(S_IRUGO | S_IWUSR), \
+ __ATTR(_name, 0644, \
sys_##_prefix##_##_name##_show, \
sys_##_prefix##_##_name##_store)
@@ -258,6 +292,16 @@ static __init enum ipl_type get_ipl_type(void)
return IPL_TYPE_FCP_DUMP;
else
return IPL_TYPE_FCP;
+ case IPL_PBT_NVME:
+ if (ipl_block.nvme.opt == IPL_PB0_NVME_OPT_DUMP)
+ return IPL_TYPE_NVME_DUMP;
+ else
+ return IPL_TYPE_NVME;
+ case IPL_PBT_ECKD:
+ if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP)
+ return IPL_TYPE_ECKD_DUMP;
+ else
+ return IPL_TYPE_ECKD;
}
return IPL_TYPE_UNKNOWN;
}
@@ -302,7 +346,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj,
}
static struct kobj_attribute sys_ipl_vm_parm_attr =
- __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL);
+ __ATTR(parm, 0444, ipl_vm_parm_show, NULL);
static ssize_t sys_ipl_device_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
@@ -311,16 +355,23 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj,
case IPL_TYPE_CCW:
return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid,
ipl_block.ccw.devno);
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid,
+ ipl_block.eckd.devno);
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno);
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ return sprintf(page, "%08ux\n", ipl_block.nvme.fid);
default:
return 0;
}
}
static struct kobj_attribute sys_ipl_device_attr =
- __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL);
+ __ATTR(device, 0444, sys_ipl_device_show, NULL);
static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
@@ -330,7 +381,7 @@ static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj,
ipl_block.hdr.len);
}
static struct bin_attribute ipl_parameter_attr =
- __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL,
+ __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL,
PAGE_SIZE);
static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj,
@@ -342,8 +393,35 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj,
return memory_read_from_buffer(buf, count, &off, scp_data, size);
}
+
+static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ unsigned int size = ipl_block.nvme.scp_data_len;
+ void *scp_data = &ipl_block.nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
+static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t off, size_t count)
+{
+ unsigned int size = ipl_block.eckd.scp_data_len;
+ void *scp_data = &ipl_block.eckd.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
static struct bin_attribute ipl_scp_data_attr =
- __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE);
+ __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE);
+
+static struct bin_attribute ipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE);
+
+static struct bin_attribute ipl_eckd_scp_data_attr =
+ __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE);
static struct bin_attribute *ipl_fcp_bin_attrs[] = {
&ipl_parameter_attr,
@@ -351,6 +429,18 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = {
NULL,
};
+static struct bin_attribute *ipl_nvme_bin_attrs[] = {
+ &ipl_parameter_attr,
+ &ipl_nvme_scp_data_attr,
+ NULL,
+};
+
+static struct bin_attribute *ipl_eckd_bin_attrs[] = {
+ &ipl_parameter_attr,
+ &ipl_eckd_scp_data_attr,
+ NULL,
+};
+
/* FCP ipl device attributes */
DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n",
@@ -362,6 +452,94 @@ DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n",
DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n",
(unsigned long long)ipl_block.fcp.br_lba);
+/* NVMe ipl device attributes */
+DEFINE_IPL_ATTR_RO(ipl_nvme, fid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.fid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, nsid, "0x%08llx\n",
+ (unsigned long long)ipl_block.nvme.nsid);
+DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n",
+ (unsigned long long)ipl_block.nvme.bootprog);
+DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n",
+ (unsigned long long)ipl_block.nvme.br_lba);
+
+/* ECKD ipl device attributes */
+DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, "%lld\n",
+ (unsigned long long)ipl_block.eckd.bootprog);
+
+#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \
+static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *buf) \
+{ \
+ struct ipl_pb0_eckd *ipb = &(_ipb); \
+ \
+ if (!ipb->br_chr.cyl && \
+ !ipb->br_chr.head && \
+ !ipb->br_chr.record) \
+ return sprintf(buf, "auto\n"); \
+ \
+ return sprintf(buf, "0x%x,0x%x,0x%x\n", \
+ ipb->br_chr.cyl, \
+ ipb->br_chr.head, \
+ ipb->br_chr.record); \
+}
+
+#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \
+static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ struct ipl_pb0_eckd *ipb = &(_ipb); \
+ unsigned long args[3] = { 0 }; \
+ char *p, *p1, *tmp = NULL; \
+ int i, rc; \
+ \
+ if (!strncmp(buf, "auto", 4)) \
+ goto out; \
+ \
+ tmp = kstrdup(buf, GFP_KERNEL); \
+ p = tmp; \
+ for (i = 0; i < 3; i++) { \
+ p1 = strsep(&p, ", "); \
+ if (!p1) { \
+ rc = -EINVAL; \
+ goto err; \
+ } \
+ rc = kstrtoul(p1, 0, args + i); \
+ if (rc) \
+ goto err; \
+ } \
+ \
+ rc = -EINVAL; \
+ if (i != 3) \
+ goto err; \
+ \
+ if ((args[0] || args[1]) && !args[2]) \
+ goto err; \
+ \
+ if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \
+ goto err; \
+ \
+out: \
+ ipb->br_chr.cyl = args[0]; \
+ ipb->br_chr.head = args[1]; \
+ ipb->br_chr.record = args[2]; \
+ rc = len; \
+err: \
+ kfree(tmp); \
+ return rc; \
+}
+
+IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd);
+static struct kobj_attribute sys_ipl_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL);
+
+IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd);
+IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd);
+
+static struct kobj_attribute sys_reipl_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store);
+
static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
@@ -379,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
__ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL);
static struct attribute *ipl_fcp_attrs[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_fcp_wwpn_attr.attr,
&sys_ipl_fcp_lun_attr.attr,
&sys_ipl_fcp_bootprog_attr.attr,
&sys_ipl_fcp_br_lba_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -396,24 +571,45 @@ static struct attribute_group ipl_fcp_attr_group = {
.bin_attrs = ipl_fcp_bin_attrs,
};
+static struct attribute *ipl_nvme_attrs[] = {
+ &sys_ipl_nvme_fid_attr.attr,
+ &sys_ipl_nvme_nsid_attr.attr,
+ &sys_ipl_nvme_bootprog_attr.attr,
+ &sys_ipl_nvme_br_lba_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_nvme_attr_group = {
+ .attrs = ipl_nvme_attrs,
+ .bin_attrs = ipl_nvme_bin_attrs,
+};
+
+static struct attribute *ipl_eckd_attrs[] = {
+ &sys_ipl_eckd_bootprog_attr.attr,
+ &sys_ipl_eckd_br_chr_attr.attr,
+ &sys_ipl_ccw_loadparm_attr.attr,
+ &sys_ipl_device_attr.attr,
+ NULL,
+};
+
+static struct attribute_group ipl_eckd_attr_group = {
+ .attrs = ipl_eckd_attrs,
+ .bin_attrs = ipl_eckd_bin_attrs,
+};
+
/* CCW ipl device attributes */
static struct attribute *ipl_ccw_attrs_vm[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
&sys_ipl_vm_parm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
static struct attribute *ipl_ccw_attrs_lpar[] = {
- &sys_ipl_type_attr.attr,
&sys_ipl_device_attr.attr,
&sys_ipl_ccw_loadparm_attr.attr,
- &sys_ipl_secure_attr.attr,
- &sys_ipl_has_secure_attr.attr,
NULL,
};
@@ -425,22 +621,21 @@ static struct attribute_group ipl_ccw_attr_group_lpar = {
.attrs = ipl_ccw_attrs_lpar
};
-/* UNKNOWN ipl device attributes */
-
-static struct attribute *ipl_unknown_attrs[] = {
+static struct attribute *ipl_common_attrs[] = {
&sys_ipl_type_attr.attr,
+ &sys_ipl_secure_attr.attr,
+ &sys_ipl_has_secure_attr.attr,
NULL,
};
-static struct attribute_group ipl_unknown_attr_group = {
- .attrs = ipl_unknown_attrs,
+static struct attribute_group ipl_common_attr_group = {
+ .attrs = ipl_common_attrs,
};
static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
- __bpon();
diag308(DIAG308_LOAD_CLEAR, NULL);
}
@@ -458,6 +653,9 @@ static int __init ipl_init(void)
rc = -ENOMEM;
goto out;
}
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group);
+ if (rc)
+ goto out;
switch (ipl_info.type) {
case IPL_TYPE_CCW:
if (MACHINE_IS_VM)
@@ -467,13 +665,19 @@ static int __init ipl_init(void)
rc = sysfs_create_group(&ipl_kset->kobj,
&ipl_ccw_attr_group_lpar);
break;
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group);
+ break;
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group);
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group);
+ break;
default:
- rc = sysfs_create_group(&ipl_kset->kobj,
- &ipl_unknown_attr_group);
break;
}
out:
@@ -564,11 +768,11 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj,
}
static struct kobj_attribute sys_reipl_nss_vmparm_attr =
- __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show,
- reipl_nss_vmparm_store);
+ __ATTR(parm, 0644, reipl_nss_vmparm_show,
+ reipl_nss_vmparm_store);
static struct kobj_attribute sys_reipl_ccw_vmparm_attr =
- __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show,
- reipl_ccw_vmparm_store);
+ __ATTR(parm, 0644, reipl_ccw_vmparm_show,
+ reipl_ccw_vmparm_store);
/* FCP reipl device attributes */
@@ -608,7 +812,7 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj,
return count;
}
static struct bin_attribute sys_reipl_fcp_scp_data_attr =
- __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read,
+ __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read,
reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE);
static struct bin_attribute *reipl_fcp_bin_attrs[] = {
@@ -673,24 +877,43 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb,
return len;
}
-/* FCP wrapper */
-static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+#define DEFINE_GENERIC_LOADPARM(name) \
+static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *page) \
+{ \
+ return reipl_generic_loadparm_show(reipl_block_##name, page); \
+} \
+static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \
+} \
+static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \
+ __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \
+ reipl_##name##_loadparm_store)
+
+DEFINE_GENERIC_LOADPARM(fcp);
+DEFINE_GENERIC_LOADPARM(nvme);
+DEFINE_GENERIC_LOADPARM(ccw);
+DEFINE_GENERIC_LOADPARM(nss);
+DEFINE_GENERIC_LOADPARM(eckd);
+
+static ssize_t reipl_fcp_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
{
- return reipl_generic_loadparm_show(reipl_block_fcp, page);
+ return sprintf(page, "%u\n", reipl_fcp_clear);
}
-static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_fcp_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
- return reipl_generic_loadparm_store(reipl_block_fcp, buf, len);
+ if (kstrtobool(buf, &reipl_fcp_clear) < 0)
+ return -EINVAL;
+ return len;
}
-static struct kobj_attribute sys_reipl_fcp_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show,
- reipl_fcp_loadparm_store);
-
static struct attribute *reipl_fcp_attrs[] = {
&sys_reipl_fcp_device_attr.attr,
&sys_reipl_fcp_wwpn_attr.attr,
@@ -706,51 +929,129 @@ static struct attribute_group reipl_fcp_attr_group = {
.bin_attrs = reipl_fcp_bin_attrs,
};
-/* CCW reipl device attributes */
-DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
+static struct kobj_attribute sys_reipl_fcp_clear_attr =
+ __ATTR(clear, 0644, reipl_fcp_clear_show, reipl_fcp_clear_store);
-/* NSS wrapper */
-static ssize_t reipl_nss_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+/* NVME reipl device attributes */
+
+static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
{
- return reipl_generic_loadparm_show(reipl_block_nss, page);
+ size_t size = reipl_block_nvme->nvme.scp_data_len;
+ void *scp_data = reipl_block_nvme->nvme.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
}
-static ssize_t reipl_nss_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
{
- return reipl_generic_loadparm_store(reipl_block_nss, buf, len);
+ size_t scpdata_len = count;
+ size_t padding;
+
+ if (off)
+ return -EINVAL;
+
+ memcpy(reipl_block_nvme->nvme.scp_data, buf, count);
+ if (scpdata_len % 8) {
+ padding = 8 - (scpdata_len % 8);
+ memset(reipl_block_nvme->nvme.scp_data + scpdata_len,
+ 0, padding);
+ scpdata_len += padding;
+ }
+
+ reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len;
+ reipl_block_nvme->nvme.scp_data_len = scpdata_len;
+
+ return count;
}
-/* CCW wrapper */
-static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+static struct bin_attribute sys_reipl_nvme_scp_data_attr =
+ __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read,
+ reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE);
+
+static struct bin_attribute *reipl_nvme_bin_attrs[] = {
+ &sys_reipl_nvme_scp_data_attr,
+ NULL,
+};
+
+DEFINE_IPL_ATTR_RW(reipl_nvme, fid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, nsid, "0x%08llx\n", "%llx\n",
+ reipl_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n",
+ reipl_block_nvme->nvme.br_lba);
+
+static struct attribute *reipl_nvme_attrs[] = {
+ &sys_reipl_nvme_fid_attr.attr,
+ &sys_reipl_nvme_nsid_attr.attr,
+ &sys_reipl_nvme_bootprog_attr.attr,
+ &sys_reipl_nvme_br_lba_attr.attr,
+ &sys_reipl_nvme_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_nvme_attr_group = {
+ .attrs = reipl_nvme_attrs,
+ .bin_attrs = reipl_nvme_bin_attrs
+};
+
+static ssize_t reipl_nvme_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_nvme_clear);
+}
+
+static ssize_t reipl_nvme_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (kstrtobool(buf, &reipl_nvme_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_nvme_clear_attr =
+ __ATTR(clear, 0644, reipl_nvme_clear_show, reipl_nvme_clear_store);
+
+/* CCW reipl device attributes */
+DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
+
+static ssize_t reipl_ccw_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
{
- return reipl_generic_loadparm_show(reipl_block_ccw, page);
+ return sprintf(page, "%u\n", reipl_ccw_clear);
}
-static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t len)
+static ssize_t reipl_ccw_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
{
- return reipl_generic_loadparm_store(reipl_block_ccw, buf, len);
+ if (kstrtobool(buf, &reipl_ccw_clear) < 0)
+ return -EINVAL;
+ return len;
}
-static struct kobj_attribute sys_reipl_ccw_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show,
- reipl_ccw_loadparm_store);
+static struct kobj_attribute sys_reipl_ccw_clear_attr =
+ __ATTR(clear, 0644, reipl_ccw_clear_show, reipl_ccw_clear_store);
static struct attribute *reipl_ccw_attrs_vm[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
&sys_reipl_ccw_vmparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
static struct attribute *reipl_ccw_attrs_lpar[] = {
&sys_reipl_ccw_device_attr.attr,
&sys_reipl_ccw_loadparm_attr.attr,
+ &sys_reipl_ccw_clear_attr.attr,
NULL,
};
@@ -764,6 +1065,86 @@ static struct attribute_group reipl_ccw_attr_group_lpar = {
.attrs = reipl_ccw_attrs_lpar,
};
+/* ECKD reipl device attributes */
+
+static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t size = reipl_block_eckd->eckd.scp_data_len;
+ void *scp_data = reipl_block_eckd->eckd.scp_data;
+
+ return memory_read_from_buffer(buf, count, &off, scp_data, size);
+}
+
+static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr,
+ char *buf, loff_t off, size_t count)
+{
+ size_t scpdata_len = count;
+ size_t padding;
+
+ if (off)
+ return -EINVAL;
+
+ memcpy(reipl_block_eckd->eckd.scp_data, buf, count);
+ if (scpdata_len % 8) {
+ padding = 8 - (scpdata_len % 8);
+ memset(reipl_block_eckd->eckd.scp_data + scpdata_len,
+ 0, padding);
+ scpdata_len += padding;
+ }
+
+ reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len;
+ reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len;
+ reipl_block_eckd->eckd.scp_data_len = scpdata_len;
+
+ return count;
+}
+
+static struct bin_attribute sys_reipl_eckd_scp_data_attr =
+ __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read,
+ reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE);
+
+static struct bin_attribute *reipl_eckd_bin_attrs[] = {
+ &sys_reipl_eckd_scp_data_attr,
+ NULL,
+};
+
+DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd);
+DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n",
+ reipl_block_eckd->eckd.bootprog);
+
+static struct attribute *reipl_eckd_attrs[] = {
+ &sys_reipl_eckd_device_attr.attr,
+ &sys_reipl_eckd_bootprog_attr.attr,
+ &sys_reipl_eckd_br_chr_attr.attr,
+ &sys_reipl_eckd_loadparm_attr.attr,
+ NULL,
+};
+
+static struct attribute_group reipl_eckd_attr_group = {
+ .attrs = reipl_eckd_attrs,
+ .bin_attrs = reipl_eckd_bin_attrs
+};
+
+static ssize_t reipl_eckd_clear_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sprintf(page, "%u\n", reipl_eckd_clear);
+}
+
+static ssize_t reipl_eckd_clear_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (kstrtobool(buf, &reipl_eckd_clear) < 0)
+ return -EINVAL;
+ return len;
+}
+
+static struct kobj_attribute sys_reipl_eckd_clear_attr =
+ __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store);
/* NSS reipl device attributes */
static void reipl_get_ascii_nss_name(char *dst,
@@ -811,12 +1192,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj,
}
static struct kobj_attribute sys_reipl_nss_name_attr =
- __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show,
- reipl_nss_name_store);
-
-static struct kobj_attribute sys_reipl_nss_loadparm_attr =
- __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show,
- reipl_nss_loadparm_store);
+ __ATTR(name, 0644, reipl_nss_name_show,
+ reipl_nss_name_store);
static struct attribute *reipl_nss_attrs[] = {
&sys_reipl_nss_name_attr.attr,
@@ -847,9 +1224,15 @@ static int reipl_set_type(enum ipl_type type)
case IPL_TYPE_CCW:
reipl_block_actual = reipl_block_ccw;
break;
+ case IPL_TYPE_ECKD:
+ reipl_block_actual = reipl_block_eckd;
+ break;
case IPL_TYPE_FCP:
reipl_block_actual = reipl_block_fcp;
break;
+ case IPL_TYPE_NVME:
+ reipl_block_actual = reipl_block_nvme;
+ break;
case IPL_TYPE_NSS:
reipl_block_actual = reipl_block_nss;
break;
@@ -874,8 +1257,12 @@ static ssize_t reipl_type_store(struct kobject *kobj,
if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_CCW);
+ else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0)
+ rc = reipl_set_type(IPL_TYPE_ECKD);
else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_FCP);
+ else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0)
+ rc = reipl_set_type(IPL_TYPE_NVME);
else if (strncmp(buf, IPL_NSS_STR, strlen(IPL_NSS_STR)) == 0)
rc = reipl_set_type(IPL_TYPE_NSS);
return (rc != 0) ? rc : len;
@@ -886,17 +1273,39 @@ static struct kobj_attribute reipl_type_attr =
static struct kset *reipl_kset;
static struct kset *reipl_fcp_kset;
+static struct kset *reipl_nvme_kset;
+static struct kset *reipl_eckd_kset;
static void __reipl_run(void *unused)
{
switch (reipl_type) {
case IPL_TYPE_CCW:
diag308(DIAG308_SET, reipl_block_ccw);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_ccw_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
+ break;
+ case IPL_TYPE_ECKD:
+ diag308(DIAG308_SET, reipl_block_eckd);
+ if (reipl_eckd_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
break;
case IPL_TYPE_FCP:
diag308(DIAG308_SET, reipl_block_fcp);
- diag308(DIAG308_LOAD_CLEAR, NULL);
+ if (reipl_fcp_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
+ break;
+ case IPL_TYPE_NVME:
+ diag308(DIAG308_SET, reipl_block_nvme);
+ if (reipl_nvme_clear)
+ diag308(DIAG308_LOAD_CLEAR, NULL);
+ else
+ diag308(DIAG308_LOAD_NORMAL, NULL);
break;
case IPL_TYPE_NSS:
diag308(DIAG308_SET, reipl_block_nss);
@@ -906,6 +1315,8 @@ static void __reipl_run(void *unused)
diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case IPL_TYPE_FCP_DUMP:
+ case IPL_TYPE_NVME_DUMP:
+ case IPL_TYPE_ECKD_DUMP:
break;
}
disabled_wait();
@@ -1008,10 +1419,16 @@ static int __init reipl_fcp_init(void)
}
rc = sysfs_create_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
- if (rc) {
- kset_unregister(reipl_fcp_kset);
- free_page((unsigned long) reipl_block_fcp);
- return rc;
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_fcp_kset->kobj,
+ &sys_reipl_fcp_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_fcp_clear = true;
}
if (ipl_info.type == IPL_TYPE_FCP) {
@@ -1032,6 +1449,121 @@ static int __init reipl_fcp_init(void)
}
reipl_capabilities |= IPL_TYPE_FCP;
return 0;
+
+out2:
+ sysfs_remove_group(&reipl_fcp_kset->kobj, &reipl_fcp_attr_group);
+out1:
+ kset_unregister(reipl_fcp_kset);
+ free_page((unsigned long) reipl_block_fcp);
+ return rc;
+}
+
+static int __init reipl_nvme_init(void)
+{
+ int rc;
+
+ reipl_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!reipl_block_nvme)
+ return -ENOMEM;
+
+ /* sysfs: create kset for mixing attr group and bin attrs */
+ reipl_nvme_kset = kset_create_and_add(IPL_NVME_STR, NULL,
+ &reipl_kset->kobj);
+ if (!reipl_nvme_kset) {
+ free_page((unsigned long) reipl_block_nvme);
+ return -ENOMEM;
+ }
+
+ rc = sysfs_create_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_nvme_kset->kobj,
+ &sys_reipl_nvme_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_nvme_clear = true;
+ }
+
+ if (ipl_info.type == IPL_TYPE_NVME) {
+ memcpy(reipl_block_nvme, &ipl_block, sizeof(ipl_block));
+ /*
+ * Fix loadparm: There are systems where the (SCSI) LOADPARM
+ * is invalid in the IPL parameter block, so take it
+ * always from sclp_ipl_info.
+ */
+ memcpy(reipl_block_nvme->nvme.loadparm, sclp_ipl_info.loadparm,
+ LOADPARM_LEN);
+ } else {
+ reipl_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ reipl_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ reipl_block_nvme->nvme.len = IPL_BP0_NVME_LEN;
+ reipl_block_nvme->nvme.pbt = IPL_PBT_NVME;
+ reipl_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_IPL;
+ }
+ reipl_capabilities |= IPL_TYPE_NVME;
+ return 0;
+
+out2:
+ sysfs_remove_group(&reipl_nvme_kset->kobj, &reipl_nvme_attr_group);
+out1:
+ kset_unregister(reipl_nvme_kset);
+ free_page((unsigned long) reipl_block_nvme);
+ return rc;
+}
+
+static int __init reipl_eckd_init(void)
+{
+ int rc;
+
+ if (!sclp.has_sipl_eckd)
+ return 0;
+
+ reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!reipl_block_eckd)
+ return -ENOMEM;
+
+ /* sysfs: create kset for mixing attr group and bin attrs */
+ reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL,
+ &reipl_kset->kobj);
+ if (!reipl_eckd_kset) {
+ free_page((unsigned long)reipl_block_eckd);
+ return -ENOMEM;
+ }
+
+ rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group);
+ if (rc)
+ goto out1;
+
+ if (test_facility(141)) {
+ rc = sysfs_create_file(&reipl_eckd_kset->kobj,
+ &sys_reipl_eckd_clear_attr.attr);
+ if (rc)
+ goto out2;
+ } else {
+ reipl_eckd_clear = true;
+ }
+
+ if (ipl_info.type == IPL_TYPE_ECKD) {
+ memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block));
+ } else {
+ reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN;
+ reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION;
+ reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN;
+ reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD;
+ reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL;
+ }
+ reipl_capabilities |= IPL_TYPE_ECKD;
+ return 0;
+
+out2:
+ sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group);
+out1:
+ kset_unregister(reipl_eckd_kset);
+ free_page((unsigned long)reipl_block_eckd);
+ return rc;
}
static int __init reipl_type_init(void)
@@ -1049,9 +1581,15 @@ static int __init reipl_type_init(void)
if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) {
memcpy(reipl_block_fcp, reipl_block, size);
reipl_type = IPL_TYPE_FCP;
+ } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_NVME) {
+ memcpy(reipl_block_nvme, reipl_block, size);
+ reipl_type = IPL_TYPE_NVME;
} else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) {
memcpy(reipl_block_ccw, reipl_block, size);
reipl_type = IPL_TYPE_CCW;
+ } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) {
+ memcpy(reipl_block_eckd, reipl_block, size);
+ reipl_type = IPL_TYPE_ECKD;
}
out:
return reipl_set_type(reipl_type);
@@ -1072,9 +1610,15 @@ static int __init reipl_init(void)
rc = reipl_ccw_init();
if (rc)
return rc;
+ rc = reipl_eckd_init();
+ if (rc)
+ return rc;
rc = reipl_fcp_init();
if (rc)
return rc;
+ rc = reipl_nvme_init();
+ if (rc)
+ return rc;
rc = reipl_nss_init();
if (rc)
return rc;
@@ -1118,6 +1662,52 @@ static struct attribute_group dump_fcp_attr_group = {
.attrs = dump_fcp_attrs,
};
+/* NVME dump device attributes */
+DEFINE_IPL_ATTR_RW(dump_nvme, fid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.fid);
+DEFINE_IPL_ATTR_RW(dump_nvme, nsid, "0x%08llx\n", "%llx\n",
+ dump_block_nvme->nvme.nsid);
+DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.bootprog);
+DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n",
+ dump_block_nvme->nvme.br_lba);
+
+static struct attribute *dump_nvme_attrs[] = {
+ &sys_dump_nvme_fid_attr.attr,
+ &sys_dump_nvme_nsid_attr.attr,
+ &sys_dump_nvme_bootprog_attr.attr,
+ &sys_dump_nvme_br_lba_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_nvme_attr_group = {
+ .name = IPL_NVME_STR,
+ .attrs = dump_nvme_attrs,
+};
+
+/* ECKD dump device attributes */
+DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd);
+DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n",
+ dump_block_eckd->eckd.bootprog);
+
+IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd);
+IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd);
+
+static struct kobj_attribute sys_dump_eckd_br_chr_attr =
+ __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store);
+
+static struct attribute *dump_eckd_attrs[] = {
+ &sys_dump_eckd_device_attr.attr,
+ &sys_dump_eckd_bootprog_attr.attr,
+ &sys_dump_eckd_br_chr_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_eckd_attr_group = {
+ .name = IPL_ECKD_STR,
+ .attrs = dump_eckd_attrs,
+};
+
/* CCW dump device attributes */
DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw);
@@ -1157,8 +1747,12 @@ static ssize_t dump_type_store(struct kobject *kobj,
rc = dump_set_type(DUMP_TYPE_NONE);
else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0)
rc = dump_set_type(DUMP_TYPE_CCW);
+ else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0)
+ rc = dump_set_type(DUMP_TYPE_ECKD);
else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0)
rc = dump_set_type(DUMP_TYPE_FCP);
+ else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0)
+ rc = dump_set_type(DUMP_TYPE_NVME);
return (rc != 0) ? rc : len;
}
@@ -1173,7 +1767,7 @@ static void diag308_dump(void *dump_block)
while (1) {
if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302)
break;
- udelay_simple(USEC_PER_SEC);
+ udelay(USEC_PER_SEC);
}
}
@@ -1183,9 +1777,15 @@ static void __dump_run(void *unused)
case DUMP_TYPE_CCW:
diag308_dump(dump_block_ccw);
break;
+ case DUMP_TYPE_ECKD:
+ diag308_dump(dump_block_eckd);
+ break;
case DUMP_TYPE_FCP:
diag308_dump(dump_block_fcp);
break;
+ case DUMP_TYPE_NVME:
+ diag308_dump(dump_block_nvme);
+ break;
default:
break;
}
@@ -1242,6 +1842,52 @@ static int __init dump_fcp_init(void)
return 0;
}
+static int __init dump_nvme_init(void)
+{
+ int rc;
+
+ if (!sclp_ipl_info.has_dump)
+ return 0; /* LDIPL DUMP is not installed */
+ dump_block_nvme = (void *) get_zeroed_page(GFP_KERNEL);
+ if (!dump_block_nvme)
+ return -ENOMEM;
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_nvme_attr_group);
+ if (rc) {
+ free_page((unsigned long)dump_block_nvme);
+ return rc;
+ }
+ dump_block_nvme->hdr.len = IPL_BP_NVME_LEN;
+ dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION;
+ dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN;
+ dump_block_nvme->fcp.pbt = IPL_PBT_NVME;
+ dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP;
+ dump_capabilities |= DUMP_TYPE_NVME;
+ return 0;
+}
+
+static int __init dump_eckd_init(void)
+{
+ int rc;
+
+ if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd)
+ return 0; /* LDIPL DUMP is not installed */
+ dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!dump_block_eckd)
+ return -ENOMEM;
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group);
+ if (rc) {
+ free_page((unsigned long)dump_block_eckd);
+ return rc;
+ }
+ dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN;
+ dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION;
+ dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN;
+ dump_block_eckd->eckd.pbt = IPL_PBT_ECKD;
+ dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP;
+ dump_capabilities |= DUMP_TYPE_ECKD;
+ return 0;
+}
+
static int __init dump_init(void)
{
int rc;
@@ -1257,9 +1903,15 @@ static int __init dump_init(void)
rc = dump_ccw_init();
if (rc)
return rc;
+ rc = dump_eckd_init();
+ if (rc)
+ return rc;
rc = dump_fcp_init();
if (rc)
return rc;
+ rc = dump_nvme_init();
+ if (rc)
+ return rc;
dump_set_type(DUMP_TYPE_NONE);
return 0;
}
@@ -1272,13 +1924,29 @@ static struct shutdown_action __refdata dump_action = {
static void dump_reipl_run(struct shutdown_trigger *trigger)
{
- unsigned long ipib = (unsigned long) reipl_block_actual;
+ struct lowcore *abs_lc;
unsigned int csum;
+ /*
+ * Set REIPL_CLEAR flag in os_info flags entry indicating
+ * 'clear' sysfs attribute has been set on the panicked system
+ * for specified reipl type.
+ * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN.
+ */
+ if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) ||
+ (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) ||
+ (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) ||
+ (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) ||
+ reipl_type == IPL_TYPE_NSS ||
+ reipl_type == IPL_TYPE_UNKNOWN)
+ os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
+ os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
csum = (__force unsigned int)
csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0);
- mem_assign_absolute(S390_lowcore.ipib, ipib);
- mem_assign_absolute(S390_lowcore.ipib_checksum, csum);
+ abs_lc = get_abs_lowcore();
+ abs_lc->ipib = __pa(reipl_block_actual);
+ abs_lc->ipib_checksum = csum;
+ put_abs_lowcore(abs_lc);
dump_run(trigger);
}
@@ -1472,7 +2140,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart);
static void __do_restart(void *ignore)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
smp_send_stop();
#ifdef CONFIG_CRASH_DUMP
crash_kexec(NULL);
@@ -1481,12 +2148,12 @@ static void __do_restart(void *ignore)
stop_run(&on_restart_trigger);
}
-void do_restart(void)
+void do_restart(void *arg)
{
tracing_off();
debug_locks_off();
lgr_info_log();
- smp_call_online_cpu(__do_restart, NULL);
+ smp_call_online_cpu(__do_restart, arg);
}
/* on halt */
@@ -1684,6 +2351,11 @@ void __init setup_ipl(void)
ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid;
ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno;
break;
+ case IPL_TYPE_ECKD:
+ case IPL_TYPE_ECKD_DUMP:
+ ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid;
+ ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno;
+ break;
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
ipl_info.data.fcp.dev_id.ssid = 0;
@@ -1691,6 +2363,11 @@ void __init setup_ipl(void)
ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn;
ipl_info.data.fcp.lun = ipl_block.fcp.lun;
break;
+ case IPL_TYPE_NVME:
+ case IPL_TYPE_NVME_DUMP:
+ ipl_info.data.nvme.fid = ipl_block.nvme.fid;
+ ipl_info.data.nvme.nsid = ipl_block.nvme.nsid;
+ break;
case IPL_TYPE_NSS:
case IPL_TYPE_UNKNOWN:
/* We have no info to copy */
@@ -1705,8 +2382,8 @@ void s390_reset_system(void)
set_prefix(0);
/* Disable lowcore protection */
- __ctl_clear_bit(0, 28);
- diag_dma_ops.diag308_reset();
+ local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
+ diag_amode31_ops.diag308_reset();
}
#ifdef CONFIG_KEXEC_FILE
@@ -1783,7 +2460,7 @@ void *ipl_report_finish(struct ipl_report *report)
buf = vzalloc(report->size);
if (!buf)
- return ERR_PTR(-ENOMEM);
+ goto out;
ptr = buf;
memcpy(ptr, report->ipib, report->ipib->hdr.len);
@@ -1822,6 +2499,7 @@ void *ipl_report_finish(struct ipl_report *report)
}
BUG_ON(ptr > buf + report->size);
+out:
return buf;
}
diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c
index af43535a976d..b5245fadcfb0 100644
--- a/arch/s390/kernel/ipl_vmparm.c
+++ b/arch/s390/kernel/ipl_vmparm.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/minmax.h>
+#include <linux/string.h>
#include <asm/ebcdic.h>
#include <asm/ipl.h>
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 8371855042dc..6f71b0ce1068 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -21,12 +21,14 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <linux/entry-common.h>
#include <asm/irq_regs.h>
#include <asm/cputime.h>
#include <asm/lowcore.h>
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/stacktrace.h>
+#include <asm/softirq_stack.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -95,27 +97,106 @@ static const struct irq_class irqclass_sub_desc[] = {
{.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"},
};
-void __init init_IRQ(void)
+static void do_IRQ(struct pt_regs *regs, int irq)
{
- BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
- init_cio_interrupts();
- init_airq_interrupts();
- init_ext_interrupts();
-}
-
-void do_IRQ(struct pt_regs *regs, int irq)
-{
- struct pt_regs *old_regs;
-
- old_regs = set_irq_regs(regs);
- irq_enter();
if (tod_after_eq(S390_lowcore.int_clock,
S390_lowcore.clock_comparator))
/* Serve timer interrupts first. */
clock_comparator_work();
generic_handle_irq(irq);
- irq_exit();
+}
+
+static int on_async_stack(void)
+{
+ unsigned long frame = current_frame_address();
+
+ return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0;
+}
+
+static void do_irq_async(struct pt_regs *regs, int irq)
+{
+ if (on_async_stack()) {
+ do_IRQ(regs, irq);
+ } else {
+ call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ,
+ struct pt_regs *, regs, int, irq);
+ }
+}
+
+static int irq_pending(struct pt_regs *regs)
+{
+ int cc;
+
+ asm volatile("tpi 0\n"
+ "ipm %0" : "=d" (cc) : : "cc");
+ return cc >> 28;
+}
+
+void noinstr do_io_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ bool from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
+ if (from_idle)
+ account_idle_time_irq();
+
+ do {
+ regs->tpi_info = S390_lowcore.tpi_info;
+ if (S390_lowcore.tpi_info.adapter_IO)
+ do_irq_async(regs, THIN_INTERRUPT);
+ else
+ do_irq_async(regs, IO_INTERRUPT);
+ } while (MACHINE_IS_LPAR && irq_pending(regs));
+
+ irq_exit_rcu();
+
set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
+}
+
+void noinstr do_ext_irq(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ bool from_idle;
+
+ irq_enter_rcu();
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+ }
+
+ regs->int_code = S390_lowcore.ext_int_code_addr;
+ regs->int_parm = S390_lowcore.ext_params;
+ regs->int_parm_long = S390_lowcore.ext_params2;
+
+ from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT);
+ if (from_idle)
+ account_idle_time_irq();
+
+ do_irq_async(regs, EXT_INTERRUPT);
+
+ irq_exit_rcu();
+ set_irq_regs(old_regs);
+ irqentry_exit(regs, state);
+
+ if (from_idle)
+ regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT);
}
static void show_msi_interrupt(struct seq_file *p, int irq)
@@ -124,7 +205,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
unsigned long flags;
int cpu;
- irq_lock_sparse();
+ rcu_read_lock();
desc = irq_to_desc(irq);
if (!desc)
goto out;
@@ -132,7 +213,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
raw_spin_lock_irqsave(&desc->lock, flags);
seq_printf(p, "%3d: ", irq);
for_each_online_cpu(cpu)
- seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+ seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
if (desc->irq_data.chip)
seq_printf(p, " %8s", desc->irq_data.chip->name);
@@ -143,7 +224,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq)
seq_putc(p, '\n');
raw_spin_unlock_irqrestore(&desc->lock, flags);
out:
- irq_unlock_sparse();
+ rcu_read_unlock();
}
/*
@@ -154,7 +235,7 @@ int show_interrupts(struct seq_file *p, void *v)
int index = *(loff_t *) v;
int cpu, irq;
- get_online_cpus();
+ cpus_read_lock();
if (index == 0) {
seq_puts(p, " ");
for_each_online_cpu(cpu)
@@ -184,7 +265,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
out:
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -194,24 +275,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
}
/*
- * Switch to the asynchronous interrupt stack for softirq execution.
- */
-void do_softirq_own_stack(void)
-{
- unsigned long old, new;
-
- old = current_stack_pointer();
- /* Check against async. stack address range. */
- new = S390_lowcore.async_stack;
- if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) {
- CALL_ON_STACK(__do_softirq, new, 0);
- } else {
- /* We are already on the async stack. */
- __do_softirq();
- }
-}
-
-/*
* ext_int_hash[index] is the list head for all external interrupts that hash
* to this index.
*/
@@ -279,7 +342,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
struct ext_int_info *p;
int index;
- ext_code = *(struct ext_code *) &regs->int_code;
+ ext_code.int_code = regs->int_code;
if (ext_code.code != EXT_IRQ_CLK_COMP)
set_cpu_flag(CIF_NOHZ_DELAY);
@@ -294,12 +357,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy)
return IRQ_HANDLED;
}
-static struct irqaction external_interrupt = {
- .name = "EXT",
- .handler = do_ext_interrupt,
-};
-
-void __init init_ext_interrupts(void)
+static void __init init_ext_interrupts(void)
{
int idx;
@@ -308,7 +366,16 @@ void __init init_ext_interrupts(void)
irq_set_chip_and_handler(EXT_INTERRUPT,
&dummy_irq_chip, handle_percpu_irq);
- setup_irq(EXT_INTERRUPT, &external_interrupt);
+ if (request_irq(EXT_INTERRUPT, do_ext_interrupt, 0, "EXT", NULL))
+ panic("Failed to register EXT interrupt\n");
+}
+
+void __init init_IRQ(void)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(irqclass_sub_desc) != NR_ARCH_IRQS);
+ init_cio_interrupts();
+ init_airq_interrupts();
+ init_ext_interrupts();
}
static DEFINE_SPINLOCK(irq_subclass_lock);
@@ -318,7 +385,7 @@ void irq_subclass_register(enum irq_subclass subclass)
{
spin_lock(&irq_subclass_lock);
if (!irq_subclass_refcount[subclass])
- ctl_set_bit(0, subclass);
+ system_ctl_set_bit(0, subclass);
irq_subclass_refcount[subclass]++;
spin_unlock(&irq_subclass_lock);
}
@@ -329,7 +396,7 @@ void irq_subclass_unregister(enum irq_subclass subclass)
spin_lock(&irq_subclass_lock);
irq_subclass_refcount[subclass]--;
if (!irq_subclass_refcount[subclass])
- ctl_clear_bit(0, subclass);
+ system_ctl_clear_bit(0, subclass);
spin_unlock(&irq_subclass_lock);
}
EXPORT_SYMBOL(irq_subclass_unregister);
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index ab584e8e3527..e808bb8bc0da 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -6,8 +6,9 @@
* Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
*/
#include <linux/uaccess.h>
-#include <linux/stop_machine.h>
#include <linux/jump_label.h>
+#include <linux/module.h>
+#include <asm/text-patching.h>
#include <asm/ipl.h>
struct insn {
@@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected,
unsigned char *ipe = (unsigned char *)expected;
unsigned char *ipn = (unsigned char *)new;
- pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc);
+ pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc);
pr_emerg("Found: %6ph\n", ipc);
pr_emerg("Expected: %6ph\n", ipe);
pr_emerg("New: %6ph\n", ipn);
panic("Corrupted kernel text");
}
-static struct insn orignop = {
- .opcode = 0xc004,
- .offset = JUMP_LABEL_NOP_OFFSET >> 1,
-};
-
-static void __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
void *code = (void *)jump_entry_code(entry);
struct insn old, new;
@@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry,
jump_label_make_branch(entry, &old);
jump_label_make_nop(entry, &new);
}
- if (init) {
- if (memcmp(code, &orignop, sizeof(orignop)))
- jump_label_bug(entry, &orignop, &new);
- } else {
- if (memcmp(code, &old, sizeof(old)))
- jump_label_bug(entry, &old, &new);
- }
+ if (memcmp(code, &old, sizeof(old)))
+ jump_label_bug(entry, &old, &new);
s390_kernel_write(code, &new, sizeof(new));
}
-static void __jump_label_sync(void *dummy)
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
{
+ jump_label_transform(entry, type);
+ text_poke_sync();
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+ enum jump_label_type type)
{
- __jump_label_transform(entry, type, 0);
- smp_call_function(__jump_label_sync, NULL, 1);
+ jump_label_transform(entry, type);
+ return true;
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+void arch_jump_label_transform_apply(void)
{
- __jump_label_transform(entry, type, 1);
+ text_poke_sync();
}
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 6f1388391620..f0cf20d4b3c5 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -7,6 +7,9 @@
* s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com>
*/
+#define pr_fmt(fmt) "kprobes: " fmt
+
+#include <linux/moduleloader.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
@@ -21,28 +24,36 @@
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/dis.h>
+#include "kprobes.h"
+#include "entry.h"
DEFINE_PER_CPU(struct kprobe *, current_kprobe);
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
struct kretprobe_blackpoint kretprobe_blacklist[] = { };
-DEFINE_INSN_CACHE_OPS(s390_insn);
-
static int insn_page_in_use;
-static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+void *alloc_insn_page(void)
+{
+ void *page;
+
+ page = module_alloc(PAGE_SIZE);
+ if (!page)
+ return NULL;
+ set_memory_rox((unsigned long)page, 1);
+ return page;
+}
static void *alloc_s390_insn_page(void)
{
if (xchg(&insn_page_in_use, 1) == 1)
return NULL;
- set_memory_x((unsigned long) &insn_page, 1);
- return &insn_page;
+ return &kprobes_insn_page;
}
static void free_s390_insn_page(void *page)
{
- set_memory_nx((unsigned long) page, 1);
xchg(&insn_page_in_use, 0);
}
@@ -56,44 +67,32 @@ struct kprobe_insn_cache kprobe_s390_insn_slots = {
static void copy_instruction(struct kprobe *p)
{
- unsigned long ip = (unsigned long) p->addr;
+ kprobe_opcode_t insn[MAX_INSN_SIZE];
s64 disp, new_disp;
u64 addr, new_addr;
+ unsigned int len;
- if (ftrace_location(ip) == ip) {
+ len = insn_length(*p->addr >> 8);
+ memcpy(&insn, p->addr, len);
+ p->opcode = insn[0];
+ if (probe_is_insn_relative_long(&insn[0])) {
/*
- * If kprobes patches the instruction that is morphed by
- * ftrace make sure that kprobes always sees the branch
- * "jg .+24" that skips the mcount block or the "brcl 0,0"
- * in case of hotpatch.
+ * For pc-relative instructions in RIL-b or RIL-c format patch
+ * the RI2 displacement field. We have already made sure that
+ * the insn slot for the patched instruction is within the same
+ * 2GB area as the original instruction (either kernel image or
+ * module area). Therefore the new displacement will always fit.
*/
- ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn);
- p->ainsn.is_ftrace_insn = 1;
- } else
- memcpy(p->ainsn.insn, p->addr, insn_length(*p->addr >> 8));
- p->opcode = p->ainsn.insn[0];
- if (!probe_is_insn_relative_long(p->ainsn.insn))
- return;
- /*
- * For pc-relative instructions in RIL-b or RIL-c format patch the
- * RI2 displacement field. We have already made sure that the insn
- * slot for the patched instruction is within the same 2GB area
- * as the original instruction (either kernel image or module area).
- * Therefore the new displacement will always fit.
- */
- disp = *(s32 *)&p->ainsn.insn[1];
- addr = (u64)(unsigned long)p->addr;
- new_addr = (u64)(unsigned long)p->ainsn.insn;
- new_disp = ((addr + (disp * 2)) - new_addr) / 2;
- *(s32 *)&p->ainsn.insn[1] = new_disp;
+ disp = *(s32 *)&insn[1];
+ addr = (u64)(unsigned long)p->addr;
+ new_addr = (u64)(unsigned long)p->ainsn.insn;
+ new_disp = ((addr + (disp * 2)) - new_addr) / 2;
+ *(s32 *)&insn[1] = new_disp;
+ }
+ s390_kernel_write(p->ainsn.insn, &insn, len);
}
NOKPROBE_SYMBOL(copy_instruction);
-static inline int is_kernel_addr(void *addr)
-{
- return addr < (void *)_end;
-}
-
static int s390_get_insn_slot(struct kprobe *p)
{
/*
@@ -102,7 +101,7 @@ static int s390_get_insn_slot(struct kprobe *p)
* field can be patched and executed within the insn slot.
*/
p->ainsn.insn = NULL;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
p->ainsn.insn = get_s390_insn_slot();
else if (is_module_addr(p->addr))
p->ainsn.insn = get_insn_slot();
@@ -114,7 +113,7 @@ static void s390_free_insn_slot(struct kprobe *p)
{
if (!p->ainsn.insn)
return;
- if (is_kernel_addr(p->addr))
+ if (is_kernel((unsigned long)p->addr))
free_s390_insn_slot(p->ainsn.insn, 0);
else
free_insn_slot(p->ainsn.insn, 0);
@@ -122,9 +121,55 @@ static void s390_free_insn_slot(struct kprobe *p)
}
NOKPROBE_SYMBOL(s390_free_insn_slot);
+/* Check if paddr is at an instruction boundary */
+static bool can_probe(unsigned long paddr)
+{
+ unsigned long addr, offset = 0;
+ kprobe_opcode_t insn;
+ struct kprobe *kp;
+
+ if (paddr & 0x01)
+ return false;
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return false;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn)))
+ return false;
+
+ if (insn >> 8 == 0) {
+ if (insn != BREAKPOINT_INSTRUCTION) {
+ /*
+ * Note that QEMU inserts opcode 0x0000 to implement
+ * software breakpoints for guests. Since the size of
+ * the original instruction is unknown, stop following
+ * instructions and prevent setting a kprobe.
+ */
+ return false;
+ }
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case the original instruction is
+ * decoded.
+ */
+ kp = get_kprobe((void *)addr);
+ if (!kp) {
+ /* not a kprobe */
+ return false;
+ }
+ insn = kp->opcode;
+ }
+ addr += insn_length(insn >> 8);
+ }
+ return addr == paddr;
+}
+
int arch_prepare_kprobe(struct kprobe *p)
{
- if ((unsigned long) p->addr & 0x01)
+ if (!can_probe((unsigned long)p->addr))
return -EINVAL;
/* Make sure the probe isn't going on a difficult instruction */
if (probe_is_prohibited_opcode(p->addr))
@@ -136,11 +181,6 @@ int arch_prepare_kprobe(struct kprobe *p)
}
NOKPROBE_SYMBOL(arch_prepare_kprobe);
-int arch_check_ftrace_location(struct kprobe *p)
-{
- return 0;
-}
-
struct swap_insn_args {
struct kprobe *p;
unsigned int arm_kprobe : 1;
@@ -149,28 +189,11 @@ struct swap_insn_args {
static int swap_instruction(void *data)
{
struct swap_insn_args *args = data;
- struct ftrace_insn new_insn, *insn;
struct kprobe *p = args->p;
- size_t len;
-
- new_insn.opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode;
- len = sizeof(new_insn.opc);
- if (!p->ainsn.is_ftrace_insn)
- goto skip_ftrace;
- len = sizeof(new_insn);
- insn = (struct ftrace_insn *) p->addr;
- if (args->arm_kprobe) {
- if (is_ftrace_nop(insn))
- new_insn.disp = KPROBE_ON_FTRACE_NOP;
- else
- new_insn.disp = KPROBE_ON_FTRACE_CALL;
- } else {
- ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr);
- if (insn->disp == KPROBE_ON_FTRACE_NOP)
- ftrace_generate_nop_insn(&new_insn);
- }
-skip_ftrace:
- s390_kernel_write(p->addr, &new_insn, len);
+ u16 opc;
+
+ opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode;
+ s390_kernel_write(p->addr, &opc, sizeof(opc));
return 0;
}
NOKPROBE_SYMBOL(swap_instruction);
@@ -201,20 +224,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb,
struct pt_regs *regs,
unsigned long ip)
{
- struct per_regs per_kprobe;
+ union {
+ struct ctlreg regs[3];
+ struct {
+ struct ctlreg control;
+ struct ctlreg start;
+ struct ctlreg end;
+ };
+ } per_kprobe;
/* Set up the PER control registers %cr9-%cr11 */
- per_kprobe.control = PER_EVENT_IFETCH;
- per_kprobe.start = ip;
- per_kprobe.end = ip;
+ per_kprobe.control.val = PER_EVENT_IFETCH;
+ per_kprobe.start.val = ip;
+ per_kprobe.end.val = ip;
/* Save control regs and psw mask */
- __ctl_store(kcb->kprobe_saved_ctl, 9, 11);
+ __local_ctl_store(9, 11, kcb->kprobe_saved_ctl);
kcb->kprobe_saved_imask = regs->psw.mask &
(PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT);
/* Set PER control regs, turns on single step for the given address */
- __ctl_load(per_kprobe, 9, 11);
+ __local_ctl_load(9, 11, per_kprobe.regs);
regs->psw.mask |= PSW_MASK_PER;
regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT);
regs->psw.addr = ip;
@@ -226,7 +256,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb,
unsigned long ip)
{
/* Restore control regs and psw mask, set new psw address */
- __ctl_load(kcb->kprobe_saved_ctl, 9, 11);
+ __local_ctl_load(9, 11, kcb->kprobe_saved_ctl);
regs->psw.mask &= ~PSW_MASK_PER;
regs->psw.mask |= kcb->kprobe_saved_imask;
regs->psw.addr = ip;
@@ -255,18 +285,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->prev_kprobe.kp = NULL;
}
NOKPROBE_SYMBOL(pop_kprobe);
-void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
-{
- ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14];
-
- /* Replace the return addr with trampoline addr */
- regs->gprs[14] = (unsigned long) &kretprobe_trampoline;
-}
-NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-
static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
{
switch (kcb->kprobe_status) {
@@ -282,7 +304,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
* is a BUG. The code path resides in the .kprobes.text
* section and is executed with interrupts disabled.
*/
- pr_err("Invalid kprobe detected.\n");
+ pr_err("Failed to recover from reentered kprobes.\n");
dump_kprobe(p);
BUG();
}
@@ -348,109 +370,6 @@ static int kprobe_handler(struct pt_regs *regs)
NOKPROBE_SYMBOL(kprobe_handler);
/*
- * Function return probe trampoline:
- * - init_kprobes() establishes a probepoint here
- * - When the probed function returns, this probe
- * causes the handlers to fire
- */
-static void __used kretprobe_trampoline_holder(void)
-{
- asm volatile(".global kretprobe_trampoline\n"
- "kretprobe_trampoline: bcr 0,0\n");
-}
-
-/*
- * Called when the probe at kretprobe trampoline is hit
- */
-static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
- struct kretprobe_instance *ri;
- struct hlist_head *head, empty_rp;
- struct hlist_node *tmp;
- unsigned long flags, orig_ret_address;
- unsigned long trampoline_address;
- kprobe_opcode_t *correct_ret_addr;
-
- INIT_HLIST_HEAD(&empty_rp);
- kretprobe_hash_lock(current, &head, &flags);
-
- /*
- * It is possible to have multiple instances associated with a given
- * task either because an multiple functions in the call path
- * have a return probe installed on them, and/or more than one return
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always inserted at the head of the list
- * - when multiple return probes are registered for the same
- * function, the first instance's ret_addr will point to the
- * real return address, and all the rest will point to
- * kretprobe_trampoline
- */
- ri = NULL;
- orig_ret_address = 0;
- correct_ret_addr = NULL;
- trampoline_address = (unsigned long) &kretprobe_trampoline;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
- correct_ret_addr = ri->ret_addr;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
-
- orig_ret_address = (unsigned long) ri->ret_addr;
-
- if (ri->rp && ri->rp->handler) {
- ri->ret_addr = correct_ret_addr;
- ri->rp->handler(ri, regs);
- }
-
- recycle_rp_inst(ri, &empty_rp);
-
- if (orig_ret_address != trampoline_address)
- /*
- * This is the real return address. Any other
- * instances associated with this task are for
- * other calls deeper on the call stack
- */
- break;
- }
-
- regs->psw.addr = orig_ret_address;
-
- kretprobe_hash_unlock(current, &flags);
-
- hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
- hlist_del(&ri->hlist);
- kfree(ri);
- }
- /*
- * By returning a non-zero value, we are telling
- * kprobe_handler() that we don't want the post_handler
- * to run (and have re-enabled preemption)
- */
- return 1;
-}
-NOKPROBE_SYMBOL(trampoline_probe_handler);
-
-/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "breakpoint"
* instruction. To avoid the SMP problems that can occur when we
@@ -464,24 +383,6 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs)
unsigned long ip = regs->psw.addr;
int fixup = probe_get_fixup_type(p->ainsn.insn);
- /* Check if the kprobes location is an enabled ftrace caller */
- if (p->ainsn.is_ftrace_insn) {
- struct ftrace_insn *insn = (struct ftrace_insn *) p->addr;
- struct ftrace_insn call_insn;
-
- ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr);
- /*
- * A kprobe on an enabled ftrace call site actually single
- * stepped an unconditional branch (ftrace nop equivalent).
- * Now we need to fixup things and pretend that a brasl r0,...
- * was executed instead.
- */
- if (insn->disp == KPROBE_ON_FTRACE_CALL) {
- ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE;
- regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn);
- }
- }
-
if (fixup & FIXUP_PSW_NORMAL)
ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn;
@@ -509,12 +410,11 @@ static int post_kprobe_handler(struct pt_regs *regs)
if (!p)
return 0;
+ resume_execution(p, regs);
if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;
p->post_handler(p, regs, 0);
}
-
- resume_execution(p, regs);
pop_kprobe(kcb);
preempt_enable_no_resched();
@@ -534,7 +434,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
struct kprobe *p = kprobe_running();
- const struct exception_table_entry *entry;
switch(kcb->kprobe_status) {
case KPROBE_HIT_SS:
@@ -553,32 +452,11 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
case KPROBE_HIT_ACTIVE:
case KPROBE_HIT_SSDONE:
/*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(p);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (p->fault_handler && p->fault_handler(p, regs, trapnr))
- return 1;
-
- /*
* In case the user-specified fault handler returned
* zero, try to fix up.
*/
- entry = s390_search_extables(regs->psw.addr);
- if (entry) {
- regs->psw.addr = extable_fixup(entry);
+ if (fixup_exception(regs))
return 1;
- }
-
/*
* fixup_exception() could not handle it,
* Let do_page_fault() fix it.
@@ -642,18 +520,13 @@ int kprobe_exceptions_notify(struct notifier_block *self,
}
NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-static struct kprobe trampoline = {
- .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
- .pre_handler = trampoline_probe_handler
-};
-
int __init arch_init_kprobes(void)
{
- return register_kprobe(&trampoline);
+ return 0;
}
int arch_trampoline_kprobe(struct kprobe *p)
{
- return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline;
+ return 0;
}
NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h
new file mode 100644
index 000000000000..dc3ed5098ee7
--- /dev/null
+++ b/arch/s390/kernel/kprobes.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef _ARCH_S390_KPROBES_H
+#define _ARCH_S390_KPROBES_H
+
+#include <linux/kprobes.h>
+
+DEFINE_INSN_CACHE_OPS(s390_insn);
+
+#endif
diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S
new file mode 100644
index 000000000000..0fe4d725e98b
--- /dev/null
+++ b/arch/s390/kernel/kprobes_insn_page.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+
+/*
+ * insn_page is a special 4k aligned dummy function for kprobes.
+ * It will contain all kprobed instructions that are out-of-line executed.
+ * The page must be within the kernel image to guarantee that the
+ * out-of-line instructions are within 2GB distance of their original
+ * location. Using a dummy function ensures that the insn_page is within
+ * the text section of the kernel and mapped read-only/executable from
+ * the beginning on, thus avoiding to split large mappings if the page
+ * would be in the data section instead.
+ */
+ .section .kprobes.text, "ax"
+ .balign 4096
+SYM_CODE_START(kprobes_insn_page)
+ .rept 2048
+ .word 0x07fe
+ .endr
+SYM_CODE_END(kprobes_insn_page)
+ .previous
diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index 452502f9a0d9..6652e54cf3db 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info)
if (stsi(si, 2, 2, 2))
return;
cpascii(lgr_info->name, si->name, sizeof(si->name));
- memcpy(&lgr_info->lpar_number, &si->lpar_number,
- sizeof(lgr_info->lpar_number));
+ lgr_info->lpar_number = si->lpar_number;
}
/*
@@ -167,7 +166,7 @@ static struct timer_list lgr_timer;
*/
static void lgr_timer_set(void)
{
- mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ);
+ mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC));
}
/*
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index cb8b1cc285c9..aa22ffc16bcd 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -3,7 +3,6 @@
* Copyright IBM Corp. 2005, 2011
*
* Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
* Michael Holzheu <holzheu@linux.vnet.ibm.com>
*/
@@ -14,24 +13,25 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
-#include <linux/suspend.h>
+#include <asm/pfault.h>
#include <asm/cio.h>
#include <asm/setup.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/ipl.h>
#include <asm/diag.h>
#include <asm/elf.h>
#include <asm/asm-offsets.h>
#include <asm/cacheflush.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
#include <asm/switch_to.h>
#include <asm/nmi.h>
+#include <asm/sclp.h>
-typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long);
+typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long);
+typedef int (*purgatory_t)(int);
extern const unsigned char relocate_kernel[];
extern const unsigned long long relocate_kernel_len;
@@ -39,44 +39,17 @@ extern const unsigned long long relocate_kernel_len;
#ifdef CONFIG_CRASH_DUMP
/*
- * PM notifier callback for kdump
- */
-static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- if (kexec_crash_image)
- arch_kexec_unprotect_crashkres();
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- if (kexec_crash_image)
- arch_kexec_protect_crashkres();
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init machine_kdump_pm_init(void)
-{
- pm_notifier(machine_kdump_pm_cb, 0);
- return 0;
-}
-arch_initcall(machine_kdump_pm_init);
-
-/*
* Reset the system, copy boot CPU registers to absolute zero,
* and jump to the kdump image
*/
-static void __do_machine_kdump(void *image)
+static void __do_machine_kdump(void *data)
{
- int (*start_kdump)(int);
+ struct kimage *image = data;
+ purgatory_t purgatory;
unsigned long prefix;
+ purgatory = (purgatory_t)image->start;
+
/* store_status() saved the prefix register to lowcore */
prefix = (unsigned long) S390_lowcore.prefixreg_save_area;
@@ -88,14 +61,12 @@ static void __do_machine_kdump(void *image)
* This need to be done *after* s390_reset_system set the
* prefix register of this CPU to zero
*/
- memcpy((void *) __LC_FPREGS_SAVE_AREA,
- (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512);
+ memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA),
+ phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512);
- __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA);
- start_kdump = (void *)((struct kimage *) image)->start;
- start_kdump(1);
+ call_nodat(1, int, purgatory, int, 1);
- /* Die if start_kdump returns */
+ /* Die if kdump returns */
disabled_wait();
}
@@ -119,16 +90,16 @@ static noinline void __machine_kdump(void *image)
continue;
}
/* Store status of the boot CPU */
- mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
- if (MACHINE_HAS_VX)
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ if (cpu_has_vx())
save_vx_regs((__vector128 *) mcesa->vector_save_area);
if (MACHINE_HAS_GS) {
- __ctl_store(cr2_old.val, 2, 2);
+ local_ctl_store(2, &cr2_old.reg);
cr2_new = cr2_old;
cr2_new.gse = 1;
- __ctl_load(cr2_new.val, 2, 2);
+ local_ctl_load(2, &cr2_new.reg);
save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area);
- __ctl_load(cr2_old.val, 2, 2);
+ local_ctl_load(2, &cr2_old.reg);
}
/*
* To create a good backchain for this CPU in the dump store_status
@@ -142,18 +113,6 @@ static noinline void __machine_kdump(void *image)
store_status(__do_machine_kdump, image);
}
-static unsigned long do_start_kdump(unsigned long addr)
-{
- struct kimage *image = (struct kimage *) addr;
- int (*start_kdump)(int) = (void *)image->start;
- int rc;
-
- __arch_local_irq_stnsm(0xfb); /* disable DAT */
- rc = start_kdump(0);
- __arch_local_irq_stosm(0x04); /* enable DAT */
- return rc;
-}
-
#endif /* CONFIG_CRASH_DUMP */
/*
@@ -162,11 +121,10 @@ static unsigned long do_start_kdump(unsigned long addr)
static bool kdump_csum_valid(struct kimage *image)
{
#ifdef CONFIG_CRASH_DUMP
+ purgatory_t purgatory = (purgatory_t)image->start;
int rc;
- preempt_disable();
- rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image);
- preempt_enable();
+ rc = call_nodat(1, int, purgatory, int, 0);
return rc == 0;
#else
return false;
@@ -240,7 +198,7 @@ int machine_kexec_prepare(struct kimage *image)
return -EINVAL;
/* Get the destination where the assembler code should be copied to.*/
- reboot_code_buffer = (void *) page_to_phys(image->control_code_page);
+ reboot_code_buffer = page_to_virt(image->control_code_page);
/* Then copy it */
memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len);
@@ -253,13 +211,17 @@ void machine_kexec_cleanup(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
+ struct lowcore *abs_lc;
+
VMCOREINFO_SYMBOL(lowcore_ptr);
VMCOREINFO_SYMBOL(high_memory);
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
- vmcoreinfo_append_str("SDMA=%lx\n", __sdma);
- vmcoreinfo_append_str("EDMA=%lx\n", __edma);
+ vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31);
+ vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
- mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
+ abs_lc = get_abs_lowcore();
+ abs_lc->vmcore_info = paddr_vmcoreinfo_note();
+ put_abs_lowcore(abs_lc);
}
void machine_shutdown(void)
@@ -276,15 +238,20 @@ void machine_crash_shutdown(struct pt_regs *regs)
*/
static void __do_machine_kexec(void *data)
{
- relocate_kernel_t data_mover;
+ unsigned long data_mover, entry, diag308_subcode;
struct kimage *image = data;
+ data_mover = page_to_phys(image->control_code_page);
+ entry = virt_to_phys(&image->head);
+ diag308_subcode = DIAG308_CLEAR_RESET;
+ if (sclp.has_iplcc)
+ diag308_subcode |= DIAG308_FLAG_EI;
s390_reset_system();
- data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page);
- __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */
- /* Call the moving routine */
- (*data_mover)(&image->head, image->start);
+ call_nodat(3, void, (relocate_kernel_t)data_mover,
+ unsigned long, entry,
+ unsigned long, image->start,
+ unsigned long, diag308_subcode);
/* Die if kexec returns */
disabled_wait();
@@ -295,7 +262,6 @@ static void __do_machine_kexec(void *data)
*/
static void __machine_kexec(void *data)
{
- __arch_local_irq_stosm(0x04); /* enable DAT */
pfault_fini();
tracing_off();
debug_locks_off();
diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
index 8415ae7d2a23..8d207b82d9fe 100644
--- a/arch/s390/kernel/machine_kexec_file.c
+++ b/arch/s390/kernel/machine_kexec_file.c
@@ -7,11 +7,14 @@
* Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/elf.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/module_signature.h>
#include <linux/verification.h>
+#include <linux/vmalloc.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
#include <asm/setup.h>
@@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
struct module_signature *ms;
unsigned long sig_len;
+ int ret;
/* Skip signature verification when not secure IPLed. */
if (!ipl_secure_flag)
@@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
return -EBADMSG;
}
- return verify_pkcs7_signature(kernel, kernel_len,
- kernel + kernel_len, sig_len,
- VERIFY_USE_PLATFORM_KEYRING,
- VERIFYING_MODULE_SIGNATURE,
- NULL, NULL);
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
+ ret = verify_pkcs7_signature(kernel, kernel_len,
+ kernel + kernel_len, sig_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_MODULE_SIGNATURE,
+ NULL, NULL);
+ return ret;
}
#endif /* CONFIG_KEXEC_SIG */
@@ -151,7 +162,7 @@ static int kexec_file_add_initrd(struct kimage *image,
buf.mem += crashk_res.start;
buf.memsz = buf.bufsz;
- data->parm->initrd_start = buf.mem;
+ data->parm->initrd_start = data->memsz;
data->parm->initrd_size = buf.memsz;
data->memsz += buf.memsz;
@@ -170,15 +181,14 @@ static int kexec_file_add_ipl_report(struct kimage *image,
struct kexec_buf buf;
unsigned long addr;
void *ptr, *end;
+ int ret;
buf.image = image;
data->memsz = ALIGN(data->memsz, PAGE_SIZE);
buf.mem = data->memsz;
- if (image->type == KEXEC_TYPE_CRASH)
- buf.mem += crashk_res.start;
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
end = ptr + ipl_cert_list_size;
ncerts = 0;
while (ptr < end) {
@@ -190,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image,
addr = data->memsz + data->report->size;
addr += ncerts * sizeof(struct ipl_rb_certificate_entry);
- ptr = (void *)ipl_cert_list_addr;
+ ptr = __va(ipl_cert_list_addr);
while (ptr < end) {
len = *(unsigned int *)ptr;
ptr += sizeof(len);
@@ -199,9 +209,13 @@ static int kexec_file_add_ipl_report(struct kimage *image,
ptr += len;
}
+ ret = -ENOMEM;
buf.buffer = ipl_report_finish(data->report);
+ if (!buf.buffer)
+ goto out;
buf.bufsz = data->report->size;
buf.memsz = buf.bufsz;
+ image->arch.ipl_buf = buf.buffer;
data->memsz += buf.memsz;
@@ -209,14 +223,21 @@ static int kexec_file_add_ipl_report(struct kimage *image,
data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr);
*lc_ipl_parmblock_ptr = (__u32)buf.mem;
- return kexec_add_buffer(&buf);
+ if (image->type == KEXEC_TYPE_CRASH)
+ buf.mem += crashk_res.start;
+
+ ret = kexec_add_buffer(&buf);
+out:
+ return ret;
}
void *kexec_file_add_components(struct kimage *image,
int (*add_kernel)(struct kimage *image,
struct s390_load_data *data))
{
+ unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE;
struct s390_load_data data = {0};
+ unsigned long minsize;
int ret;
data.report = ipl_report_init(&ipl_block);
@@ -227,10 +248,23 @@ void *kexec_file_add_components(struct kimage *image,
if (ret)
goto out;
- if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) {
- ret = -EINVAL;
+ ret = -EINVAL;
+ minsize = PARMAREA + offsetof(struct parmarea, command_line);
+ if (image->kernel_buf_len < minsize)
goto out;
- }
+
+ if (data.parm->max_command_line_size)
+ max_command_line_size = data.parm->max_command_line_size;
+
+ if (minsize + max_command_line_size < minsize)
+ goto out;
+
+ if (image->kernel_buf_len < minsize + max_command_line_size)
+ goto out;
+
+ if (image->cmdline_buf_len >= max_command_line_size)
+ goto out;
+
memcpy(data.parm->command_line, image->cmdline_buf,
image->cmdline_buf_len);
@@ -267,8 +301,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
const Elf_Shdr *relsec,
const Elf_Shdr *symtab)
{
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
Elf_Rela *relas;
int i, r_type;
+ int ret;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
relas = (void *)pi->ehdr + relsec->sh_offset;
@@ -281,15 +323,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
sym = (void *)pi->ehdr + symtab->sh_offset;
sym += ELF64_R_SYM(relas[i].r_info);
- if (sym->st_shndx == SHN_UNDEF)
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
return -ENOEXEC;
+ }
- if (sym->st_shndx == SHN_COMMON)
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
return -ENOEXEC;
+ }
if (sym->st_shndx >= pi->ehdr->e_shnum &&
- sym->st_shndx != SHN_ABS)
+ sym->st_shndx != SHN_ABS) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
return -ENOEXEC;
+ }
loc = pi->purgatory_buf;
loc += section->sh_offset;
@@ -303,21 +357,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
addr = section->sh_addr + relas[i].r_offset;
r_type = ELF64_R_TYPE(relas[i].r_info);
- arch_kexec_do_relocs(r_type, loc, val, addr);
+
+ if (r_type == R_390_PLT32DBL)
+ r_type = R_390_PC32DBL;
+
+ ret = arch_kexec_do_relocs(r_type, loc, val, addr);
+ if (ret) {
+ pr_err("Unknown rela relocation: %d\n", r_type);
+ return -ENOEXEC;
+ }
}
return 0;
}
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
- /* A kernel must be at least large enough to contain head.S. During
- * load memory in head.S will be accessed, e.g. to register the next
- * command line. If the next kernel were smaller the current kernel
- * will panic at load.
- */
- if (buf_len < HEAD_END)
- return -ENOEXEC;
-
- return kexec_image_probe_default(image, buf, buf_len);
+ vfree(image->arch.ipl_buf);
+ image->arch.ipl_buf = NULL;
+
+ return kexec_image_post_load_cleanup_default(image);
}
diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c
index d5035de9020e..b7182cec48dc 100644
--- a/arch/s390/kernel/machine_kexec_reloc.c
+++ b/arch/s390/kernel/machine_kexec_reloc.c
@@ -28,6 +28,7 @@ int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val,
break;
case R_390_64: /* Direct 64 bit. */
case R_390_GLOB_DAT:
+ case R_390_JMP_SLOT:
*(u64 *)loc = val;
break;
case R_390_PC16: /* PC relative 16 bit. */
diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index 9e1660a6b9db..ae4d4fd9afcd 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S
@@ -2,8 +2,6 @@
/*
* Copyright IBM Corp. 2008, 2009
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/linkage.h>
@@ -11,84 +9,187 @@
#include <asm/ftrace.h>
#include <asm/nospec-insn.h>
#include <asm/ptrace.h>
-#include <asm/export.h>
+
+#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE)
+#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
+#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
+#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+
+#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE)
+#define STACK_FREGS (STACK_FRAME_OVERHEAD)
+#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS)
+#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS)
+#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW)
+#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2)
+#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS)
+
+/* packed stack: allocate just enough for r14, r15 and backchain */
+#define TRACED_FUNC_FRAME_SIZE 24
+
+#ifdef CONFIG_FUNCTION_TRACER
GEN_BR_THUNK %r1
GEN_BR_THUNK %r14
.section .kprobes.text, "ax"
-ENTRY(ftrace_stub)
+SYM_FUNC_START(ftrace_stub)
BR_EX %r14
-ENDPROC(ftrace_stub)
+SYM_FUNC_END(ftrace_stub)
-#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE)
-#define STACK_PTREGS (STACK_FRAME_OVERHEAD)
-#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
-#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW)
+SYM_CODE_START(ftrace_stub_direct_tramp)
+ lgr %r1, %r0
+ BR_EX %r1
+SYM_CODE_END(ftrace_stub_direct_tramp)
-ENTRY(_mcount)
- BR_EX %r14
-ENDPROC(_mcount)
-EXPORT_SYMBOL(_mcount)
+ .macro ftrace_regs_entry, allregs=0
+ stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller
+
+ .if \allregs == 1
+ # save psw mask
+ # don't put any instructions clobbering CC before this point
+ epsw %r1,%r14
+ risbg %r14,%r1,0,31,32
+ .endif
-ENTRY(ftrace_caller)
- .globl ftrace_regs_caller
- .set ftrace_regs_caller,ftrace_caller
lgr %r1,%r15
-#if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT))
- aghi %r0,MCOUNT_RETURN_FIXUP
-#endif
- aghi %r15,-STACK_FRAME_SIZE
+ # allocate stack frame for ftrace_caller to contain traced function
+ aghi %r15,-TRACED_FUNC_FRAME_SIZE
stg %r1,__SF_BACKCHAIN(%r15)
- stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
- stg %r0,(STACK_PTREGS_PSW+8)(%r15)
- stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
+ stg %r0,(__SF_GPRS+8*8)(%r15)
+ stg %r15,(__SF_GPRS+9*8)(%r15)
+ # allocate ftrace_regs and stack frame for ftrace_trace_function
+ aghi %r15,-STACK_FRAME_SIZE_FREGS
+ stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15)
+ xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+
+ .if \allregs == 1
+ stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15)
+ mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS
+ .else
+ xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15)
+ .endif
+
+ lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address
+ aghi %r1,-TRACED_FUNC_FRAME_SIZE
+ stg %r1,__SF_BACKCHAIN(%r15)
+ stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
+ stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
+ .endm
+
+SYM_CODE_START(ftrace_regs_caller)
+ ftrace_regs_entry 1
+ j ftrace_common
+SYM_CODE_END(ftrace_regs_caller)
+
+SYM_CODE_START(ftrace_caller)
+ ftrace_regs_entry 0
+ j ftrace_common
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_common)
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
aghik %r2,%r0,-MCOUNT_INSN_SIZE
lgrl %r4,function_trace_op
- lgrl %r1,ftrace_trace_function
+ lgrl %r1,ftrace_func
#else
lgr %r2,%r0
aghi %r2,-MCOUNT_INSN_SIZE
larl %r4,function_trace_op
lg %r4,0(%r4)
- larl %r1,ftrace_trace_function
+ larl %r1,ftrace_func
lg %r1,0(%r1)
#endif
lgr %r3,%r14
- la %r5,STACK_PTREGS(%r15)
+ la %r5,STACK_FREGS(%r15)
BASR_EX %r14,%r1
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
# The j instruction gets runtime patched to a nop instruction.
# See ftrace_enable_ftrace_graph_caller.
- .globl ftrace_graph_caller
-ftrace_graph_caller:
- j ftrace_graph_caller_end
- lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15)
- lg %r4,(STACK_PTREGS_PSW+8)(%r15)
+SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL)
+ j .Lftrace_graph_caller_end
+ lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
+ lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15)
brasl %r14,prepare_ftrace_return
- stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15)
-ftrace_graph_caller_end:
- .globl ftrace_graph_caller_end
+ stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15)
+.Lftrace_graph_caller_end:
+#endif
+ lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15)
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+ ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+ locgrz %r1,%r0
+#else
+ lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15)
+ ltgr %r1,%r1
+ jnz 0f
+ lgr %r1,%r0
#endif
- lg %r1,(STACK_PTREGS_PSW+8)(%r15)
- lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
+0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15)
BR_EX %r1
-ENDPROC(ftrace_caller)
+SYM_CODE_END(ftrace_common)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(return_to_handler)
+SYM_FUNC_START(return_to_handler)
stmg %r2,%r5,32(%r15)
lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
+ aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE)
stg %r1,__SF_BACKCHAIN(%r15)
+ la %r3,STACK_FRAME_OVERHEAD(%r15)
+ stg %r1,__FGRAPH_RET_FP(%r3)
+ stg %r2,__FGRAPH_RET_GPR2(%r3)
+ lgr %r2,%r3
brasl %r14,ftrace_return_to_handler
- aghi %r15,STACK_FRAME_OVERHEAD
+ aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE
lgr %r14,%r2
lmg %r2,%r5,32(%r15)
BR_EX %r14
-ENDPROC(return_to_handler)
+SYM_FUNC_END(return_to_handler)
#endif
+#endif /* CONFIG_FUNCTION_TRACER */
+
+SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br)
+ lmg %r0,%r1,2(%r1)
+ br %r1
+SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL)
+SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br)
+
+#ifdef CONFIG_EXPOLINE
+SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl)
+ lmg %r0,%r1,2(%r1)
+ exrl %r0,0f
+ j .
+0: br %r1
+SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL)
+SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl)
+#endif /* CONFIG_EXPOLINE */
+
+#ifdef CONFIG_RETHOOK
+
+SYM_CODE_START(arch_rethook_trampoline)
+ stg %r14,(__SF_GPRS+8*8)(%r15)
+ lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15)
+ stmg %r0,%r14,STACK_PTREGS_GPRS(%r15)
+
+ # store original stack pointer in backchain and pt_regs
+ lay %r7,STACK_FRAME_SIZE_PTREGS(%r15)
+ stg %r7,__SF_BACKCHAIN(%r15)
+ stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15)
+
+ # store full psw
+ epsw %r2,%r3
+ risbg %r3,%r2,0,31,32
+ stg %r3,STACK_PTREGS_PSW(%r15)
+ larl %r1,arch_rethook_trampoline
+ stg %r1,STACK_PTREGS_PSW+8(%r15)
+
+ lay %r2,STACK_PTREGS(%r15)
+ brasl %r14,arch_rethook_trampoline_callback
+
+ mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15)
+ lmg %r0,%r15,STACK_PTREGS_GPRS(%r15)
+ lpswe __SF_EMPTY(%r15)
+SYM_CODE_END(arch_rethook_trampoline)
+
+#endif /* CONFIG_RETHOOK */
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index ba8f19bb438b..42215f9404af 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -14,14 +14,19 @@
#include <linux/elf.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
+#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/bug.h>
+#include <linux/memory.h>
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
#include <asm/facility.h>
+#include <asm/ftrace.lds.h>
+#include <asm/set_memory.h>
+#include <asm/setup.h>
#if 0
#define DEBUGP printk
@@ -29,24 +34,52 @@
#define DEBUGP(fmt , ...)
#endif
-#define PLT_ENTRY_SIZE 20
+#define PLT_ENTRY_SIZE 22
+
+static unsigned long get_module_load_offset(void)
+{
+ static DEFINE_MUTEX(module_kaslr_mutex);
+ static unsigned long module_load_offset;
+
+ if (!kaslr_enabled())
+ return 0;
+ /*
+ * Calculate the module_load_offset the first time this code
+ * is called. Once calculated it stays the same until reboot.
+ */
+ mutex_lock(&module_kaslr_mutex);
+ if (!module_load_offset)
+ module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ return module_load_offset;
+}
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask, PAGE_KERNEL,
+ VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
return p;
}
+#ifdef CONFIG_FUNCTION_TRACER
+void module_arch_cleanup(struct module *mod)
+{
+ module_memfree(mod->arch.trampolines_start);
+}
+#endif
+
void module_arch_freeing_init(struct module *mod)
{
if (is_livepatch_module(mod) &&
@@ -114,6 +147,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
Elf_Rela *rela;
char *strings;
int nrela, i, j;
+ struct module_memory *mod_mem;
/* Find symbol table and string table. */
symtab = NULL;
@@ -161,23 +195,26 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
/* Increase core size by size of got & plt and set start
offsets for got and plt. */
- me->core_layout.size = ALIGN(me->core_layout.size, 4);
- me->arch.got_offset = me->core_layout.size;
- me->core_layout.size += me->arch.got_size;
- me->arch.plt_offset = me->core_layout.size;
+ mod_mem = &me->mem[MOD_TEXT];
+ mod_mem->size = ALIGN(mod_mem->size, 4);
+ me->arch.got_offset = mod_mem->size;
+ mod_mem->size += me->arch.got_size;
+ me->arch.plt_offset = mod_mem->size;
if (me->arch.plt_size) {
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable)
me->arch.plt_size += PLT_ENTRY_SIZE;
- me->core_layout.size += me->arch.plt_size;
+ mod_mem->size += me->arch.plt_size;
}
return 0;
}
static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
- int sign, int bits, int shift)
+ int sign, int bits, int shift,
+ void *(*write)(void *dest, const void *src, size_t len))
{
unsigned long umax;
long min, max;
+ void *dest = (void *)loc;
if (val & ((1UL << shift) - 1))
return -ENOEXEC;
@@ -194,26 +231,33 @@ static int apply_rela_bits(Elf_Addr loc, Elf_Addr val,
return -ENOEXEC;
}
- if (bits == 8)
- *(unsigned char *) loc = val;
- else if (bits == 12)
- *(unsigned short *) loc = (val & 0xfff) |
+ if (bits == 8) {
+ unsigned char tmp = val;
+ write(dest, &tmp, 1);
+ } else if (bits == 12) {
+ unsigned short tmp = (val & 0xfff) |
(*(unsigned short *) loc & 0xf000);
- else if (bits == 16)
- *(unsigned short *) loc = val;
- else if (bits == 20)
- *(unsigned int *) loc = (val & 0xfff) << 16 |
- (val & 0xff000) >> 4 |
- (*(unsigned int *) loc & 0xf00000ff);
- else if (bits == 32)
- *(unsigned int *) loc = val;
- else if (bits == 64)
- *(unsigned long *) loc = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 16) {
+ unsigned short tmp = val;
+ write(dest, &tmp, 2);
+ } else if (bits == 20) {
+ unsigned int tmp = (val & 0xfff) << 16 |
+ (val & 0xff000) >> 4 | (*(unsigned int *) loc & 0xf00000ff);
+ write(dest, &tmp, 4);
+ } else if (bits == 32) {
+ unsigned int tmp = val;
+ write(dest, &tmp, 4);
+ } else if (bits == 64) {
+ unsigned long tmp = val;
+ write(dest, &tmp, 8);
+ }
return 0;
}
static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
- const char *strtab, struct module *me)
+ const char *strtab, struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
struct mod_arch_syminfo *info;
Elf_Addr loc, val;
@@ -241,17 +285,17 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_64: /* Direct 64 bit. */
val += rela->r_addend;
if (r_type == R_390_8)
- rc = apply_rela_bits(loc, val, 0, 8, 0);
+ rc = apply_rela_bits(loc, val, 0, 8, 0, write);
else if (r_type == R_390_12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_PC16: /* PC relative 16 bit. */
case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */
@@ -260,15 +304,15 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PC64: /* PC relative 64 bit. */
val += rela->r_addend - loc;
if (r_type == R_390_PC16)
- rc = apply_rela_bits(loc, val, 1, 16, 0);
+ rc = apply_rela_bits(loc, val, 1, 16, 0, write);
else if (r_type == R_390_PC16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PC32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PC32)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_PC64)
- rc = apply_rela_bits(loc, val, 1, 64, 0);
+ rc = apply_rela_bits(loc, val, 1, 64, 0, write);
break;
case R_390_GOT12: /* 12 bit GOT offset. */
case R_390_GOT16: /* 16 bit GOT offset. */
@@ -283,33 +327,34 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_GOTPLT64: /* 64 bit offset to jump slot. */
case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */
if (info->got_initialized == 0) {
- Elf_Addr *gotent;
+ Elf_Addr *gotent = me->mem[MOD_TEXT].base +
+ me->arch.got_offset +
+ info->got_offset;
- gotent = me->core_layout.base + me->arch.got_offset +
- info->got_offset;
- *gotent = val;
+ write(gotent, &val, sizeof(*gotent));
info->got_initialized = 1;
}
val = info->got_offset + rela->r_addend;
if (r_type == R_390_GOT12 ||
r_type == R_390_GOTPLT12)
- rc = apply_rela_bits(loc, val, 0, 12, 0);
+ rc = apply_rela_bits(loc, val, 0, 12, 0, write);
else if (r_type == R_390_GOT16 ||
r_type == R_390_GOTPLT16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOT20 ||
r_type == R_390_GOTPLT20)
- rc = apply_rela_bits(loc, val, 1, 20, 0);
+ rc = apply_rela_bits(loc, val, 1, 20, 0, write);
else if (r_type == R_390_GOT32 ||
r_type == R_390_GOTPLT32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOT64 ||
r_type == R_390_GOTPLT64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
else if (r_type == R_390_GOTENT ||
r_type == R_390_GOTPLTENT) {
- val += (Elf_Addr) me->core_layout.base - loc;
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ val += (Elf_Addr)me->mem[MOD_TEXT].base +
+ me->arch.got_offset - loc;
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
}
break;
case R_390_PLT16DBL: /* 16 bit PC rel. PLT shifted by 1. */
@@ -320,25 +365,28 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */
case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */
if (info->plt_initialized == 0) {
- unsigned int *ip;
- ip = me->core_layout.base + me->arch.plt_offset +
- info->plt_offset;
- ip[0] = 0x0d10e310; /* basr 1,0 */
- ip[1] = 0x100a0004; /* lg 1,10(1) */
+ unsigned char insn[PLT_ENTRY_SIZE];
+ char *plt_base;
+ char *ip;
+
+ plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset;
+ ip = plt_base + info->plt_offset;
+ *(int *)insn = 0x0d10e310; /* basr 1,0 */
+ *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) {
- unsigned int *ij;
- ij = me->core_layout.base +
- me->arch.plt_offset +
- me->arch.plt_size - PLT_ENTRY_SIZE;
- ip[2] = 0xa7f40000 + /* j __jump_r1 */
- (unsigned int)(u16)
- (((unsigned long) ij - 8 -
- (unsigned long) ip) / 2);
+ char *jump_r1;
+
+ jump_r1 = plt_base + me->arch.plt_size -
+ PLT_ENTRY_SIZE;
+ /* brcl 0xf,__jump_r1 */
+ *(short *)&insn[8] = 0xc0f4;
+ *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2;
} else {
- ip[2] = 0x07f10000; /* br %r1 */
+ *(int *)&insn[8] = 0x07f10000; /* br %r1 */
}
- ip[3] = (unsigned int) (val >> 32);
- ip[4] = (unsigned int) val;
+ *(long *)&insn[14] = val;
+
+ write(ip, insn, sizeof(insn));
info->plt_initialized = 1;
}
if (r_type == R_390_PLTOFF16 ||
@@ -351,44 +399,44 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
val - loc + 0xffffUL < 0x1ffffeUL) ||
(r_type == R_390_PLT32DBL &&
val - loc + 0xffffffffULL < 0x1fffffffeULL)))
- val = (Elf_Addr) me->core_layout.base +
+ val = (Elf_Addr) me->mem[MOD_TEXT].base +
me->arch.plt_offset +
info->plt_offset;
val += rela->r_addend - loc;
}
if (r_type == R_390_PLT16DBL)
- rc = apply_rela_bits(loc, val, 1, 16, 1);
+ rc = apply_rela_bits(loc, val, 1, 16, 1, write);
else if (r_type == R_390_PLTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_PLT32DBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
else if (r_type == R_390_PLT32 ||
r_type == R_390_PLTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_PLT64 ||
r_type == R_390_PLTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTOFF16: /* 16 bit offset to GOT. */
case R_390_GOTOFF32: /* 32 bit offset to GOT. */
case R_390_GOTOFF64: /* 64 bit offset to GOT. */
val = val + rela->r_addend -
- ((Elf_Addr) me->core_layout.base + me->arch.got_offset);
+ ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset);
if (r_type == R_390_GOTOFF16)
- rc = apply_rela_bits(loc, val, 0, 16, 0);
+ rc = apply_rela_bits(loc, val, 0, 16, 0, write);
else if (r_type == R_390_GOTOFF32)
- rc = apply_rela_bits(loc, val, 0, 32, 0);
+ rc = apply_rela_bits(loc, val, 0, 32, 0, write);
else if (r_type == R_390_GOTOFF64)
- rc = apply_rela_bits(loc, val, 0, 64, 0);
+ rc = apply_rela_bits(loc, val, 0, 64, 0, write);
break;
case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */
case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */
- val = (Elf_Addr) me->core_layout.base + me->arch.got_offset +
+ val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset +
rela->r_addend - loc;
if (r_type == R_390_GOTPC)
- rc = apply_rela_bits(loc, val, 1, 32, 0);
+ rc = apply_rela_bits(loc, val, 1, 32, 0, write);
else if (r_type == R_390_GOTPCDBL)
- rc = apply_rela_bits(loc, val, 1, 32, 1);
+ rc = apply_rela_bits(loc, val, 1, 32, 1, write);
break;
case R_390_COPY:
case R_390_GLOB_DAT: /* Create GOT entry. */
@@ -412,9 +460,10 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
return 0;
}
-int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+static int __apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symindex, unsigned int relsec,
- struct module *me)
+ struct module *me,
+ void *(*write)(void *dest, const void *src, size_t len))
{
Elf_Addr base;
Elf_Sym *symtab;
@@ -430,13 +479,51 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
n = sechdrs[relsec].sh_size / sizeof(Elf_Rela);
for (i = 0; i < n; i++, rela++) {
- rc = apply_rela(rela, base, symtab, strtab, me);
+ rc = apply_rela(rela, base, symtab, strtab, me, write);
if (rc)
return rc;
}
return 0;
}
+int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
+ unsigned int symindex, unsigned int relsec,
+ struct module *me)
+{
+ bool early = me->state == MODULE_STATE_UNFORMED;
+ void *(*write)(void *, const void *, size_t) = memcpy;
+
+ if (!early)
+ write = s390_kernel_write;
+
+ return __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
+ const Elf_Shdr *s)
+{
+ char *start, *end;
+ int numpages;
+ size_t size;
+
+ size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
+ numpages = DIV_ROUND_UP(size, PAGE_SIZE);
+ start = module_alloc(numpages * PAGE_SIZE);
+ if (!start)
+ return -ENOMEM;
+ set_memory_rox((unsigned long)start, numpages);
+ end = start + size;
+
+ me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start;
+ me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end;
+ me->arch.next_trampoline = me->arch.trampolines_start;
+
+ return 0;
+}
+#endif /* CONFIG_FUNCTION_TRACER */
+
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
@@ -444,22 +531,19 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *s;
char *secstrings, *secname;
void *aseg;
+#ifdef CONFIG_FUNCTION_TRACER
+ int ret;
+#endif
if (IS_ENABLED(CONFIG_EXPOLINE) &&
!nospec_disable && me->arch.plt_size) {
unsigned int *ij;
- ij = me->core_layout.base + me->arch.plt_offset +
+ ij = me->mem[MOD_TEXT].base + me->arch.plt_offset +
me->arch.plt_size - PLT_ENTRY_SIZE;
- if (test_facility(35)) {
- ij[0] = 0xc6000000; /* exrl %r0,.+10 */
- ij[1] = 0x0005a7f4; /* j . */
- ij[2] = 0x000007f1; /* br %r1 */
- } else {
- ij[0] = 0x44000000 | (unsigned int)
- offsetof(struct lowcore, br_r1_trampoline);
- ij[1] = 0xa7f40000; /* j . */
- }
+ ij[0] = 0xc6000000; /* exrl %r0,.+10 */
+ ij[1] = 0x0005a7f4; /* j . */
+ ij[2] = 0x000007f1; /* br %r1 */
}
secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -478,8 +562,15 @@ int module_finalize(const Elf_Ehdr *hdr,
if (IS_ENABLED(CONFIG_EXPOLINE) &&
(str_has_prefix(secname, ".s390_return")))
nospec_revert(aseg, aseg + s->sh_size);
+
+#ifdef CONFIG_FUNCTION_TRACER
+ if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) {
+ ret = module_alloc_ftrace_hotpatch_trampolines(me, s);
+ if (ret < 0)
+ return ret;
+ }
+#endif /* CONFIG_FUNCTION_TRACER */
}
- jump_label_apply_nops(me);
return 0;
}
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 0a487fae763e..9ad44c26d1a2 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -6,12 +6,12 @@
* Author(s): Ingo Adlung <adlung@de.ibm.com>,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
* Cornelia Huck <cornelia.huck@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*/
#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
+#include <linux/entry-common.h>
#include <linux/hardirq.h>
#include <linux/log2.h>
#include <linux/kprobes.h>
@@ -19,18 +19,20 @@
#include <linux/time.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
-
+#include <linux/kvm_host.h>
#include <linux/export.h>
#include <asm/lowcore.h>
+#include <asm/ctlreg.h>
#include <asm/smp.h>
#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
#include <asm/switch_to.h>
-#include <asm/ctl_reg.h>
#include <asm/asm-offsets.h>
-#include <linux/kvm_host.h>
+#include <asm/pai.h>
+#include <asm/vx-insn.h>
+#include <asm/fpu/api.h>
struct mcck_struct {
unsigned int kill_task : 1;
@@ -41,116 +43,134 @@ struct mcck_struct {
};
static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
-static struct kmem_cache *mcesa_cache;
-static unsigned long mcesa_origin_lc;
static inline int nmi_needs_mcesa(void)
{
- return MACHINE_HAS_VX || MACHINE_HAS_GS;
-}
-
-static inline unsigned long nmi_get_mcesa_size(void)
-{
- if (MACHINE_HAS_GS)
- return MCESA_MAX_SIZE;
- return MCESA_MIN_SIZE;
+ return cpu_has_vx() || MACHINE_HAS_GS;
}
/*
* The initial machine check extended save area for the boot CPU.
- * It will be replaced by nmi_init() with an allocated structure.
- * The structure is required for machine check happening early in
- * the boot process.
+ * It will be replaced on the boot CPU reinit with an allocated
+ * structure. The structure is required for machine check happening
+ * early in the boot process.
*/
-static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE);
+static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE);
-void __init nmi_alloc_boot_cpu(struct lowcore *lc)
+void __init nmi_alloc_mcesa_early(u64 *mcesad)
{
if (!nmi_needs_mcesa())
return;
- lc->mcesad = (unsigned long) &boot_mcesa;
+ *mcesad = __pa(&boot_mcesa);
if (MACHINE_HAS_GS)
- lc->mcesad |= ilog2(MCESA_MAX_SIZE);
+ *mcesad |= ilog2(MCESA_MAX_SIZE);
}
-static int __init nmi_init(void)
+int nmi_alloc_mcesa(u64 *mcesad)
{
- unsigned long origin, cr0, size;
+ unsigned long size;
+ void *origin;
+ *mcesad = 0;
if (!nmi_needs_mcesa())
return 0;
- size = nmi_get_mcesa_size();
- if (size > MCESA_MIN_SIZE)
- mcesa_origin_lc = ilog2(size);
- /* create slab cache for the machine-check-extended-save-areas */
- mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL);
- if (!mcesa_cache)
- panic("Couldn't create nmi save area cache");
- origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
+ size = MACHINE_HAS_GS ? MCESA_MAX_SIZE : MCESA_MIN_SIZE;
+ origin = kmalloc(size, GFP_KERNEL);
if (!origin)
- panic("Couldn't allocate nmi save area");
+ return -ENOMEM;
/* The pointer is stored with mcesa_bits ORed in */
- kmemleak_not_leak((void *) origin);
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- /* Replace boot_mcesa on the boot CPU */
- S390_lowcore.mcesad = origin | mcesa_origin_lc;
- __ctl_load(cr0, 0, 0);
+ kmemleak_not_leak(origin);
+ *mcesad = __pa(origin);
+ if (MACHINE_HAS_GS)
+ *mcesad |= ilog2(MCESA_MAX_SIZE);
return 0;
}
-early_initcall(nmi_init);
-int nmi_alloc_per_cpu(struct lowcore *lc)
+void nmi_free_mcesa(u64 *mcesad)
{
- unsigned long origin;
-
if (!nmi_needs_mcesa())
- return 0;
- origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
- if (!origin)
- return -ENOMEM;
- /* The pointer is stored with mcesa_bits ORed in */
- kmemleak_not_leak((void *) origin);
- lc->mcesad = origin | mcesa_origin_lc;
- return 0;
+ return;
+ kfree(__va(*mcesad & MCESA_ORIGIN_MASK));
}
-void nmi_free_per_cpu(struct lowcore *lc)
+static __always_inline char *nmi_puts(char *dest, const char *src)
{
- if (!nmi_needs_mcesa())
- return;
- kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK));
+ while (*src)
+ *dest++ = *src++;
+ *dest = 0;
+ return dest;
+}
+
+static __always_inline char *u64_to_hex(char *dest, u64 val)
+{
+ int i, num;
+
+ for (i = 1; i <= 16; i++) {
+ num = (val >> (64 - 4 * i)) & 0xf;
+ if (num >= 10)
+ *dest++ = 'A' + num - 10;
+ else
+ *dest++ = '0' + num;
+ }
+ *dest = 0;
+ return dest;
}
static notrace void s390_handle_damage(void)
{
+ union ctlreg0 cr0, cr0_new;
+ char message[100];
+ psw_t psw_save;
+ char *ptr;
+
smp_emergency_stop();
+ diag_amode31_ops.diag308_reset();
+ ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x");
+ u64_to_hex(ptr, S390_lowcore.mcck_interruption_code);
+
+ /*
+ * Disable low address protection and make machine check new PSW a
+ * disabled wait PSW. Any additional machine check cannot be handled.
+ */
+ local_ctl_store(0, &cr0.reg);
+ cr0_new = cr0;
+ cr0_new.lap = 0;
+ local_ctl_load(0, &cr0_new.reg);
+ psw_save = S390_lowcore.mcck_new_psw;
+ psw_bits(S390_lowcore.mcck_new_psw).io = 0;
+ psw_bits(S390_lowcore.mcck_new_psw).ext = 0;
+ psw_bits(S390_lowcore.mcck_new_psw).wait = 1;
+ sclp_emergency_printk(message);
+
+ /*
+ * Restore machine check new PSW and control register 0 to original
+ * values. This makes possible system dump analysis easier.
+ */
+ S390_lowcore.mcck_new_psw = psw_save;
+ local_ctl_load(0, &cr0.reg);
disabled_wait();
while (1);
}
NOKPROBE_SYMBOL(s390_handle_damage);
/*
- * Main machine check handler function. Will be called with interrupts enabled
- * or disabled and machine checks enabled or disabled.
+ * Main machine check handler function. Will be called with interrupts disabled
+ * and machine checks enabled.
*/
void s390_handle_mcck(void)
{
- unsigned long flags;
struct mcck_struct mcck;
+ unsigned long mflags;
/*
* Disable machine checks and get the current state of accumulated
* machine checks. Afterwards delete the old state and enable machine
* checks again.
*/
- local_irq_save(flags);
- local_mcck_disable();
+ local_mcck_save(mflags);
mcck = *this_cpu_ptr(&cpu_mcck);
memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
- clear_cpu_flag(CIF_MCCK_PENDING);
- local_mcck_enable();
- local_irq_restore(flags);
+ local_mcck_restore(mflags);
if (mcck.channel_report)
crw_handle_channel_report();
@@ -167,134 +187,150 @@ void s390_handle_mcck(void)
static int mchchk_wng_posted = 0;
/* Use single cpu clear, as we cannot handle smp here. */
- __ctl_clear_bit(14, 24); /* Disable WARNING MCH */
+ local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT);
if (xchg(&mchchk_wng_posted, 1) == 0)
kill_cad_pid(SIGPWR, 1);
}
if (mcck.stp_queue)
stp_queue_work();
if (mcck.kill_task) {
- local_irq_enable();
printk(KERN_EMERG "mcck: Terminating task because of machine "
"malfunction (code 0x%016lx).\n", mcck.mcck_code);
printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
current->comm, current->pid);
- do_exit(SIGSEGV);
+ if (is_global_init(current))
+ panic("mcck: Attempting to kill init!\n");
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID);
}
}
-EXPORT_SYMBOL_GPL(s390_handle_mcck);
/*
- * returns 0 if all required registers are available
+ * returns 0 if register contents could be validated
* returns 1 otherwise
*/
-static int notrace s390_check_registers(union mci mci, int umode)
+static int notrace s390_validate_registers(union mci mci)
{
+ struct mcesa *mcesa;
+ void *fpt_save_area;
union ctlreg2 cr2;
int kill_task;
+ u64 zero;
kill_task = 0;
+ zero = 0;
- if (!mci.gr) {
- /*
- * General purpose registers couldn't be restored and have
- * unknown contents. Stop system or terminate process.
- */
- if (!umode)
- s390_handle_damage();
+ if (!mci.gr || !mci.fp)
kill_task = 1;
- }
- /* Check control registers */
- if (!mci.cr) {
- /*
- * Control registers have unknown contents.
- * Can't recover and therefore stopping machine.
- */
- s390_handle_damage();
- }
- if (!mci.fp) {
- /*
- * Floating point registers can't be restored. If the
- * kernel currently uses floating point registers the
- * system is stopped. If the process has its floating
- * pointer registers loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
- }
+ fpt_save_area = &S390_lowcore.floating_pt_save_area;
if (!mci.fc) {
- /*
- * Floating point control register can't be restored.
- * If the kernel currently uses the floating pointer
- * registers and needs the FPC register the system is
- * stopped. If the process has its floating pointer
- * registers loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_FPC)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
+ kill_task = 1;
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (zero));
+ } else {
+ asm volatile(
+ " lfpc %0\n"
+ :
+ : "Q" (S390_lowcore.fpt_creg_save_area));
}
- if (MACHINE_HAS_VX) {
- if (!mci.vr) {
- /*
- * Vector registers can't be restored. If the kernel
- * currently uses vector registers the system is
- * stopped. If the process has its vector registers
- * loaded it is terminated.
- */
- if (S390_lowcore.fpu_flags & KERNEL_VXR)
- s390_handle_damage();
- if (!test_cpu_flag(CIF_FPU))
- kill_task = 1;
- }
- }
- /* Check if access registers are valid */
- if (!mci.ar) {
+ mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
+ if (!cpu_has_vx()) {
+ /* Validate floating point registers */
+ asm volatile(
+ " ld 0,0(%0)\n"
+ " ld 1,8(%0)\n"
+ " ld 2,16(%0)\n"
+ " ld 3,24(%0)\n"
+ " ld 4,32(%0)\n"
+ " ld 5,40(%0)\n"
+ " ld 6,48(%0)\n"
+ " ld 7,56(%0)\n"
+ " ld 8,64(%0)\n"
+ " ld 9,72(%0)\n"
+ " ld 10,80(%0)\n"
+ " ld 11,88(%0)\n"
+ " ld 12,96(%0)\n"
+ " ld 13,104(%0)\n"
+ " ld 14,112(%0)\n"
+ " ld 15,120(%0)\n"
+ :
+ : "a" (fpt_save_area)
+ : "memory");
+ } else {
+ /* Validate vector registers */
+ union ctlreg0 cr0;
+
/*
- * Access registers have unknown contents.
- * Terminating task.
+ * The vector validity must only be checked if not running a
+ * KVM guest. For KVM guests the machine check is forwarded by
+ * KVM and it is the responsibility of the guest to take
+ * appropriate actions. The host vector or FPU values have been
+ * saved by KVM and will be restored by KVM.
*/
- kill_task = 1;
+ if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
+ cr0.reg = S390_lowcore.cregs_save_area[0];
+ cr0.afp = cr0.vx = 1;
+ local_ctl_load(0, &cr0.reg);
+ asm volatile(
+ " la 1,%0\n"
+ " VLM 0,15,0,1\n"
+ " VLM 16,31,256,1\n"
+ :
+ : "Q" (*(struct vx_array *)mcesa->vector_save_area)
+ : "1");
+ local_ctl_load(0, &S390_lowcore.cregs_save_area[0]);
}
- /* Check guarded storage registers */
- cr2.val = S390_lowcore.cregs_save_area[2];
+ /* Validate access registers */
+ asm volatile(
+ " lam 0,15,0(%0)\n"
+ :
+ : "a" (&S390_lowcore.access_regs_save_area)
+ : "memory");
+ if (!mci.ar)
+ kill_task = 1;
+ /* Validate guarded storage registers */
+ cr2.reg = S390_lowcore.cregs_save_area[2];
if (cr2.gse) {
if (!mci.gs) {
/*
- * Guarded storage register can't be restored and
- * the current processes uses guarded storage.
- * It has to be terminated.
+ * 2 cases:
+ * - machine check in kernel or userspace
+ * - machine check while running SIE (KVM guest)
+ * For kernel or userspace the userspace values of
+ * guarded storage control can not be recreated, the
+ * process must be terminated.
+ * For SIE the guest values of guarded storage can not
+ * be recreated. This is either due to a bug or due to
+ * GS being disabled in the guest. The guest will be
+ * notified by KVM code and the guests machine check
+ * handling must take care of this. The host values
+ * are saved by KVM and are not affected.
*/
- kill_task = 1;
+ if (!test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
+ } else {
+ load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
}
- /* Check if old PSW is valid */
- if (!mci.wp) {
- /*
- * Can't tell if we come from user or kernel mode
- * -> stopping machine.
- */
- s390_handle_damage();
- }
- /* Check for invalid kernel instruction address */
- if (!mci.ia && !umode) {
- /*
- * The instruction address got lost while running
- * in the kernel -> stopping machine.
- */
- s390_handle_damage();
- }
+ /*
+ * The getcpu vdso syscall reads CPU number from the programmable
+ * field of the TOD clock. Disregard the TOD programmable register
+ * validity bit and load the CPU number into the TOD programmable
+ * field unconditionally.
+ */
+ set_tod_programmable_field(raw_smp_processor_id());
+ /* Validate clock comparator register */
+ set_clock_comparator(S390_lowcore.clock_comparator);
if (!mci.ms || !mci.pm || !mci.ia)
kill_task = 1;
return kill_task;
}
-NOKPROBE_SYMBOL(s390_check_registers);
+NOKPROBE_SYMBOL(s390_validate_registers);
/*
* Backup the guest's machine check info to its description block
@@ -305,8 +341,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs)
struct sie_page *sie_page;
/* r14 contains the sie block, which was set in sie64a */
- struct kvm_s390_sie_block *sie_block =
- (struct kvm_s390_sie_block *) regs->gprs[14];
+ struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]);
if (sie_block == NULL)
/* Something's seriously wrong, stop system. */
@@ -340,19 +375,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
static unsigned long long last_ipd;
struct mcck_struct *mcck;
unsigned long long tmp;
+ irqentry_state_t irq_state;
union mci mci;
unsigned long mcck_dam_code;
+ int mcck_pending = 0;
+
+ irq_state = irqentry_nmi_enter(regs);
- nmi_enter();
+ if (user_mode(regs))
+ update_timer_mcck();
inc_irq_stat(NMI_NMI);
mci.val = S390_lowcore.mcck_interruption_code;
mcck = this_cpu_ptr(&cpu_mcck);
- if (mci.sd) {
- /* System damage -> stopping machine */
- s390_handle_damage();
- }
-
/*
* Reinject the instruction processing damages' machine checks
* including Delayed Access Exception into the guest
@@ -393,14 +428,16 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
s390_handle_damage();
}
}
- if (s390_check_registers(mci, user_mode(regs))) {
+ if (s390_validate_registers(mci)) {
+ if (!user_mode(regs))
+ s390_handle_damage();
/*
* Couldn't restore all register contents for the
* user space process -> mark task for termination.
*/
mcck->kill_task = 1;
mcck->mcck_code = mci.val;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -420,34 +457,32 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
mcck->stp_queue |= stp_sync_check();
if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
- if (mcck->stp_queue)
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
-
/*
* Reinject storage related machine checks into the guest if they
* happen when the guest is running.
*/
if (!test_cpu_flag(CIF_MCCK_GUEST)) {
+ /* Storage error uncorrected */
if (mci.se)
- /* Storage error uncorrected */
s390_handle_damage();
+ /* Storage key-error uncorrected */
if (mci.ke)
- /* Storage key-error uncorrected */
s390_handle_damage();
+ /* Storage degradation */
if (mci.ds && mci.fa)
- /* Storage degradation */
s390_handle_damage();
}
if (mci.cp) {
/* Channel report word pending */
mcck->channel_report = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
if (mci.w) {
/* Warning pending */
mcck->warning = 1;
- set_cpu_flag(CIF_MCCK_PENDING);
+ mcck_pending = 1;
}
/*
@@ -462,15 +497,19 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
*((long *)(regs->gprs[15] + __SF_SIE_REASON)) = -EINTR;
}
clear_cpu_flag(CIF_MCCK_GUEST);
- nmi_exit();
+
+ if (mcck_pending)
+ schedule_mcck_handler();
+
+ irqentry_nmi_exit(regs, irq_state);
}
NOKPROBE_SYMBOL(s390_do_machine_check);
static int __init machine_check_init(void)
{
- ctl_set_bit(14, 25); /* enable external damage MCH */
- ctl_set_bit(14, 27); /* enable system recovery MCH */
- ctl_set_bit(14, 24); /* enable warning MCH */
+ system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT);
+ system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT);
+ system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT);
return 0;
}
early_initcall(machine_check_init);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 29e511f5bf06..d1b16d83e49a 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -14,14 +14,14 @@ static int __init nobp_setup_early(char *str)
return rc;
if (enabled && test_facility(82)) {
/*
- * The user explicitely requested nobp=1, enable it and
+ * The user explicitly requested nobp=1, enable it and
* disable the expoline support.
*/
- __set_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __set_facility(82, alt_stfle_fac_list);
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_disable = 1;
} else {
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
return 0;
}
@@ -29,7 +29,7 @@ early_param("nobp", nobp_setup_early);
static int __init nospec_setup_early(char *str)
{
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
return 0;
}
early_param("nospec", nospec_setup_early);
@@ -38,9 +38,9 @@ static int __init nospec_report(void)
{
if (test_facility(156))
pr_info("Spectre V2 mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
pr_info("Spectre V2 mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
pr_info("Spectre V2 mitigation: limited branch prediction\n");
return 0;
}
@@ -66,14 +66,14 @@ void __init nospec_auto_detect(void)
*/
if (__is_defined(CC_USING_EXPOLINE))
nospec_disable = 1;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
} else if (__is_defined(CC_USING_EXPOLINE)) {
/*
* The kernel has been compiled with expolines.
* Keep expolines enabled and disable nobp.
*/
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
/*
* If the kernel has not been compiled with expolines the
@@ -86,7 +86,7 @@ static int __init spectre_v2_setup_early(char *str)
{
if (str && !strncmp(str, "on", 2)) {
nospec_disable = 0;
- __clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+ __clear_facility(82, alt_stfle_fac_list);
}
if (str && !strncmp(str, "off", 3))
nospec_disable = 1;
@@ -99,11 +99,13 @@ early_param("spectre_v2", spectre_v2_setup_early);
static void __init_or_module __nospec_revert(s32 *start, s32 *end)
{
enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type;
+ static const u8 branch[] = { 0x47, 0x00, 0x07, 0x00 };
u8 *instr, *thunk, *br;
u8 insnbuf[6];
s32 *epo;
/* Second part of the instruction replace is always a nop */
+ memcpy(insnbuf + 2, branch, sizeof(branch));
for (epo = start; epo < end; epo++) {
instr = (u8 *) epo + *epo;
if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04)
@@ -116,42 +118,20 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
br = thunk + (*(int *)(thunk + 2)) * 2;
- else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 &&
- thunk[6] == 0x44 && thunk[7] == 0x00 &&
- (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 &&
- (thunk[1] & 0xf0) == (thunk[8] & 0xf0))
- /* larl %rx,<target br> + ex %r0,0(%rx) */
- br = thunk + (*(int *)(thunk + 2)) * 2;
else
continue;
- /* Check for unconditional branch 0x07f? or 0x47f???? */
- if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0)
+ if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
continue;
-
- memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4);
switch (type) {
case BRCL_EXPOLINE:
+ /* brcl to thunk, replace with br + nop */
insnbuf[0] = br[0];
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brcl to b, replace with bc + nopr */
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brcl to br, replace with bcr + nop */
- }
break;
case BRASL_EXPOLINE:
+ /* brasl to thunk, replace with basr + nop */
+ insnbuf[0] = 0x0d;
insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
- if (br[0] == 0x47) {
- /* brasl to b, replace with bas + nopr */
- insnbuf[0] = 0x4d;
- insnbuf[2] = br[2];
- insnbuf[3] = br[3];
- } else {
- /* brasl to br, replace with basr + nop */
- insnbuf[0] = 0x0d;
- }
break;
}
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
index 48f472bf9290..52d4353188ad 100644
--- a/arch/s390/kernel/nospec-sysfs.c
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -15,9 +15,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
{
if (test_facility(156))
return sprintf(buf, "Mitigation: etokens\n");
- if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+ if (nospec_uses_trampoline())
return sprintf(buf, "Mitigation: execute trampolines\n");
- if (__test_facility(82, S390_lowcore.alt_stfle_fac_list))
+ if (__test_facility(82, alt_stfle_fac_list))
return sprintf(buf, "Mitigation: limited branch prediction\n");
return sprintf(buf, "Vulnerable\n");
}
diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c
new file mode 100644
index 000000000000..23ab9f02f278
--- /dev/null
+++ b/arch/s390/kernel/numa.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NUMA support for s390
+ *
+ * Implement NUMA core code.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <asm/numa.h>
+
+struct pglist_data *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+void __init numa_setup(void)
+{
+ int nid;
+
+ nodes_clear(node_possible_map);
+ node_set(0, node_possible_map);
+ node_set_online(0);
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8);
+ if (!NODE_DATA(nid))
+ panic("%s: Failed to allocate %zu bytes align=0x%x\n",
+ __func__, sizeof(pg_data_t), 8);
+ }
+ NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT;
+ NODE_DATA(0)->node_id = 0;
+}
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index 0a5e4bafb6ad..6e1824141b29 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -13,8 +13,10 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/checksum.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
+#include <asm/maccess.h>
+#include <asm/asm-offsets.h>
/*
* OS info structure has to be page aligned
@@ -45,24 +47,26 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
*/
void os_info_entry_add(int nr, void *ptr, u64 size)
{
- os_info.entry[nr].addr = (u64)(unsigned long)ptr;
+ os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0);
os_info.csum = os_info_csum(&os_info);
}
/*
- * Initialize OS info struture and set lowcore pointer
+ * Initialize OS info structure and set lowcore pointer
*/
void __init os_info_init(void)
{
- void *ptr = &os_info;
+ struct lowcore *abs_lc;
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
os_info.csum = os_info_csum(&os_info);
- mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr);
+ abs_lc = get_abs_lowcore();
+ abs_lc->os_info = __pa(&os_info);
+ put_abs_lowcore(abs_lc);
}
#ifdef CONFIG_CRASH_DUMP
@@ -90,7 +94,7 @@ static void os_info_old_alloc(int nr, int align)
goto fail;
}
buf_align = PTR_ALIGN(buf, align);
- if (copy_oldmem_kernel(buf_align, (void *) addr, size)) {
+ if (copy_oldmem_kernel(buf_align, addr, size)) {
msg = "copy failed";
goto fail_free;
}
@@ -121,17 +125,16 @@ static void os_info_old_init(void)
if (os_info_init)
return;
- if (!OLDMEM_BASE)
+ if (!oldmem_data.start)
goto fail;
- if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr)))
+ if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr)))
goto fail;
if (addr == 0 || addr % PAGE_SIZE)
goto fail;
os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL);
if (!os_info_old)
goto fail;
- if (copy_oldmem_kernel(os_info_old, (void *) addr,
- sizeof(*os_info_old)))
+ if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old)))
goto fail_free;
if (os_info_old->magic != OS_INFO_MAGIC)
goto fail_free;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 0eb1d1cc53a8..41ed6e0f0a2a 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -2,8 +2,9 @@
/*
* Performance event support for s390x - CPU-measurement Counter Facility
*
- * Copyright IBM Corp. 2012, 2019
+ * Copyright IBM Corp. 2012, 2023
* Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ * Thomas Richter <tmricht@linux.ibm.com>
*/
#define KMSG_COMPONENT "cpum_cf"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
@@ -14,7 +15,574 @@
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/export.h>
-#include <asm/cpu_mcf.h>
+#include <linux/miscdevice.h>
+#include <linux/perf_event.h>
+
+#include <asm/cpu_mf.h>
+#include <asm/hwctrset.h>
+#include <asm/debug.h>
+
+enum cpumf_ctr_set {
+ CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */
+ CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */
+ CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */
+ CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */
+ CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */
+
+ /* Maximum number of counter sets */
+ CPUMF_CTR_SET_MAX,
+};
+
+#define CPUMF_LCCTL_ENABLE_SHIFT 16
+#define CPUMF_LCCTL_ACTCTL_SHIFT 0
+
+static inline void ctr_set_enable(u64 *state, u64 ctrsets)
+{
+ *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT;
+}
+
+static inline void ctr_set_disable(u64 *state, u64 ctrsets)
+{
+ *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT);
+}
+
+static inline void ctr_set_start(u64 *state, u64 ctrsets)
+{
+ *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT;
+}
+
+static inline void ctr_set_stop(u64 *state, u64 ctrsets)
+{
+ *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT);
+}
+
+static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest)
+{
+ switch (set) {
+ case CPUMF_CTR_SET_BASIC:
+ return stcctm(BASIC, range, dest);
+ case CPUMF_CTR_SET_USER:
+ return stcctm(PROBLEM_STATE, range, dest);
+ case CPUMF_CTR_SET_CRYPTO:
+ return stcctm(CRYPTO_ACTIVITY, range, dest);
+ case CPUMF_CTR_SET_EXT:
+ return stcctm(EXTENDED, range, dest);
+ case CPUMF_CTR_SET_MT_DIAG:
+ return stcctm(MT_DIAG_CLEARING, range, dest);
+ case CPUMF_CTR_SET_MAX:
+ return 3;
+ }
+ return 3;
+}
+
+struct cpu_cf_events {
+ refcount_t refcnt; /* Reference count */
+ atomic_t ctr_set[CPUMF_CTR_SET_MAX];
+ u64 state; /* For perf_event_open SVC */
+ u64 dev_state; /* For /dev/hwctr */
+ unsigned int flags;
+ size_t used; /* Bytes used in data */
+ size_t usedss; /* Bytes used in start/stop */
+ unsigned char start[PAGE_SIZE]; /* Counter set at event add */
+ unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */
+ unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */
+ unsigned int sets; /* # Counter set saved in memory */
+};
+
+static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */
+static debug_info_t *cf_dbg;
+
+/*
+ * The CPU Measurement query counter information instruction contains
+ * information which varies per machine generation, but is constant and
+ * does not change when running on a particular machine, such as counter
+ * first and second version number. This is needed to determine the size
+ * of counter sets. Extract this information at device driver initialization.
+ */
+static struct cpumf_ctr_info cpumf_ctr_info;
+
+struct cpu_cf_ptr {
+ struct cpu_cf_events *cpucf;
+};
+
+static struct cpu_cf_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct cpu_cf_ptr __percpu *cfptr;
+} cpu_cf_root;
+
+/*
+ * Serialize event initialization and event removal. Both are called from
+ * user space in task context with perf_event_open() and close()
+ * system calls.
+ *
+ * This mutex serializes functions cpum_cf_alloc_cpu() called at event
+ * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu()
+ * called at event removal via call back function hw_perf_event_destroy()
+ * when the event is deleted. They are serialized to enforce correct
+ * bookkeeping of pointer and reference counts anchored by
+ * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the
+ * per CPU pointers stored in cpu_cf_root::cfptr.
+ */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Get pointer to per-cpu structure.
+ *
+ * Function get_cpu_cfhw() is called from
+ * - cfset_copy_all(): This function is protected by cpus_read_lock(), so
+ * CPU hot plug remove can not happen. Event removal requires a close()
+ * first.
+ *
+ * Function this_cpu_cfhw() is called from perf common code functions:
+ * - pmu_{en|dis}able(), pmu_{add|del}()and pmu_{start|stop}():
+ * All functions execute with interrupts disabled on that particular CPU.
+ * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_copy_all().
+ *
+ * Therefore it is safe to access the CPU specific pointer to the event.
+ */
+static struct cpu_cf_events *get_cpu_cfhw(int cpu)
+{
+ struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr;
+
+ if (p) {
+ struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu);
+
+ return q->cpucf;
+ }
+ return NULL;
+}
+
+static struct cpu_cf_events *this_cpu_cfhw(void)
+{
+ return get_cpu_cfhw(smp_processor_id());
+}
+
+/* Disable counter sets on dedicated CPU */
+static void cpum_cf_reset_cpu(void *flags)
+{
+ lcctl(0);
+}
+
+/* Free per CPU data when the last event is removed. */
+static void cpum_cf_free_root(void)
+{
+ if (!refcount_dec_and_test(&cpu_cf_root.refcnt))
+ return;
+ free_percpu(cpu_cf_root.cfptr);
+ cpu_cf_root.cfptr = NULL;
+ irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
+ on_each_cpu(cpum_cf_reset_cpu, NULL, 1);
+ debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n",
+ __func__, refcount_read(&cpu_cf_root.refcnt),
+ !cpu_cf_root.cfptr);
+}
+
+/*
+ * On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int cpum_cf_alloc_root(void)
+{
+ int rc = 0;
+
+ if (refcount_inc_not_zero(&cpu_cf_root.refcnt))
+ return rc;
+
+ /* The memory is already zeroed. */
+ cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr);
+ if (cpu_cf_root.cfptr) {
+ refcount_set(&cpu_cf_root.refcnt, 1);
+ on_each_cpu(cpum_cf_reset_cpu, NULL, 1);
+ irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
+ } else {
+ rc = -ENOMEM;
+ }
+
+ return rc;
+}
+
+/* Free CPU counter data structure for a PMU */
+static void cpum_cf_free_cpu(int cpu)
+{
+ struct cpu_cf_events *cpuhw;
+ struct cpu_cf_ptr *p;
+
+ mutex_lock(&pmc_reserve_mutex);
+ /*
+ * When invoked via CPU hotplug handler, there might be no events
+ * installed or that particular CPU might not have an
+ * event installed. This anchor pointer can be NULL!
+ */
+ if (!cpu_cf_root.cfptr)
+ goto out;
+ p = per_cpu_ptr(cpu_cf_root.cfptr, cpu);
+ cpuhw = p->cpucf;
+ /*
+ * Might be zero when called from CPU hotplug handler and no event
+ * installed on that CPU, but on different CPUs.
+ */
+ if (!cpuhw)
+ goto out;
+
+ if (refcount_dec_and_test(&cpuhw->refcnt)) {
+ kfree(cpuhw);
+ p->cpucf = NULL;
+ }
+ cpum_cf_free_root();
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+}
+
+/* Allocate CPU counter data structure for a PMU. Called under mutex lock. */
+static int cpum_cf_alloc_cpu(int cpu)
+{
+ struct cpu_cf_events *cpuhw;
+ struct cpu_cf_ptr *p;
+ int rc;
+
+ mutex_lock(&pmc_reserve_mutex);
+ rc = cpum_cf_alloc_root();
+ if (rc)
+ goto unlock;
+ p = per_cpu_ptr(cpu_cf_root.cfptr, cpu);
+ cpuhw = p->cpucf;
+
+ if (!cpuhw) {
+ cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL);
+ if (cpuhw) {
+ p->cpucf = cpuhw;
+ refcount_set(&cpuhw->refcnt, 1);
+ } else {
+ rc = -ENOMEM;
+ }
+ } else {
+ refcount_inc(&cpuhw->refcnt);
+ }
+ if (rc) {
+ /*
+ * Error in allocation of event, decrement anchor. Since
+ * cpu_cf_event in not created, its destroy() function is not
+ * invoked. Adjust the reference counter for the anchor.
+ */
+ cpum_cf_free_root();
+ }
+unlock:
+ mutex_unlock(&pmc_reserve_mutex);
+ return rc;
+}
+
+/*
+ * Create/delete per CPU data structures for /dev/hwctr interface and events
+ * created by perf_event_open().
+ * If cpu is -1, track task on all available CPUs. This requires
+ * allocation of hardware data structures for all CPUs. This setup handles
+ * perf_event_open() with task context and /dev/hwctr interface.
+ * If cpu is non-zero install event on this CPU only. This setup handles
+ * perf_event_open() with CPU context.
+ */
+static int cpum_cf_alloc(int cpu)
+{
+ cpumask_var_t mask;
+ int rc;
+
+ if (cpu == -1) {
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ for_each_online_cpu(cpu) {
+ rc = cpum_cf_alloc_cpu(cpu);
+ if (rc) {
+ for_each_cpu(cpu, mask)
+ cpum_cf_free_cpu(cpu);
+ break;
+ }
+ cpumask_set_cpu(cpu, mask);
+ }
+ free_cpumask_var(mask);
+ } else {
+ rc = cpum_cf_alloc_cpu(cpu);
+ }
+ return rc;
+}
+
+static void cpum_cf_free(int cpu)
+{
+ if (cpu == -1) {
+ for_each_online_cpu(cpu)
+ cpum_cf_free_cpu(cpu);
+ } else {
+ cpum_cf_free_cpu(cpu);
+ }
+}
+
+#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
+ /* interval in seconds */
+
+/* Counter sets are stored as data stream in a page sized memory buffer and
+ * exported to user space via raw data attached to the event sample data.
+ * Each counter set starts with an eight byte header consisting of:
+ * - a two byte eye catcher (0xfeef)
+ * - a one byte counter set number
+ * - a two byte counter set size (indicates the number of counters in this set)
+ * - a three byte reserved value (must be zero) to make the header the same
+ * size as a counter value.
+ * All counter values are eight byte in size.
+ *
+ * All counter sets are followed by a 64 byte trailer.
+ * The trailer consists of a:
+ * - flag field indicating valid fields when corresponding bit set
+ * - the counter facility first and second version number
+ * - the CPU speed if nonzero
+ * - the time stamp the counter sets have been collected
+ * - the time of day (TOD) base value
+ * - the machine type.
+ *
+ * The counter sets are saved when the process is prepared to be executed on a
+ * CPU and saved again when the process is going to be removed from a CPU.
+ * The difference of both counter sets are calculated and stored in the event
+ * sample data area.
+ */
+struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
+ unsigned int def:16; /* 0-15 Data Entry Format */
+ unsigned int set:16; /* 16-31 Counter set identifier */
+ unsigned int ctr:16; /* 32-47 Number of stored counters */
+ unsigned int res1:16; /* 48-63 Reserved */
+};
+
+struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
+ /* 0 - 7 */
+ union {
+ struct {
+ unsigned int clock_base:1; /* TOD clock base set */
+ unsigned int speed:1; /* CPU speed set */
+ /* Measurement alerts */
+ unsigned int mtda:1; /* Loss of MT ctr. data alert */
+ unsigned int caca:1; /* Counter auth. change alert */
+ unsigned int lcda:1; /* Loss of counter data alert */
+ };
+ unsigned long flags; /* 0-63 All indicators */
+ };
+ /* 8 - 15 */
+ unsigned int cfvn:16; /* 64-79 Ctr First Version */
+ unsigned int csvn:16; /* 80-95 Ctr Second Version */
+ unsigned int cpu_speed:32; /* 96-127 CPU speed */
+ /* 16 - 23 */
+ unsigned long timestamp; /* 128-191 Timestamp (TOD) */
+ /* 24 - 55 */
+ union {
+ struct {
+ unsigned long progusage1;
+ unsigned long progusage2;
+ unsigned long progusage3;
+ unsigned long tod_base;
+ };
+ unsigned long progusage[4];
+ };
+ /* 56 - 63 */
+ unsigned int mach_type:16; /* Machine type */
+ unsigned int res1:16; /* Reserved */
+ unsigned int res2:32; /* Reserved */
+};
+
+/* Create the trailer data at the end of a page. */
+static void cfdiag_trailer(struct cf_trailer_entry *te)
+{
+ struct cpuid cpuid;
+
+ te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */
+ te->csvn = cpumf_ctr_info.csvn;
+
+ get_cpu_id(&cpuid); /* Machine type */
+ te->mach_type = cpuid.machine;
+ te->cpu_speed = cfdiag_cpu_speed;
+ if (te->cpu_speed)
+ te->speed = 1;
+ te->clock_base = 1; /* Save clock base */
+ te->tod_base = tod_clock_base.tod;
+ te->timestamp = get_tod_clock_fast();
+}
+
+/*
+ * The number of counters per counter set varies between machine generations,
+ * but is constant when running on a particular machine generation.
+ * Determine each counter set size at device driver initialization and
+ * retrieve it later.
+ */
+static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX];
+static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
+{
+ size_t ctrset_size = 0;
+
+ switch (ctrset) {
+ case CPUMF_CTR_SET_BASIC:
+ if (cpumf_ctr_info.cfvn >= 1)
+ ctrset_size = 6;
+ break;
+ case CPUMF_CTR_SET_USER:
+ if (cpumf_ctr_info.cfvn == 1)
+ ctrset_size = 6;
+ else if (cpumf_ctr_info.cfvn >= 3)
+ ctrset_size = 2;
+ break;
+ case CPUMF_CTR_SET_CRYPTO:
+ if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
+ ctrset_size = 16;
+ else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ ctrset_size = 20;
+ break;
+ case CPUMF_CTR_SET_EXT:
+ if (cpumf_ctr_info.csvn == 1)
+ ctrset_size = 32;
+ else if (cpumf_ctr_info.csvn == 2)
+ ctrset_size = 48;
+ else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5)
+ ctrset_size = 128;
+ else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ ctrset_size = 160;
+ break;
+ case CPUMF_CTR_SET_MT_DIAG:
+ if (cpumf_ctr_info.csvn > 3)
+ ctrset_size = 48;
+ break;
+ case CPUMF_CTR_SET_MAX:
+ break;
+ }
+ cpumf_ctr_setsizes[ctrset] = ctrset_size;
+}
+
+/*
+ * Return the maximum possible counter set size (in number of 8 byte counters)
+ * depending on type and model number.
+ */
+static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset)
+{
+ return cpumf_ctr_setsizes[ctrset];
+}
+
+/* Read a counter set. The counter set number determines the counter set and
+ * the CPUM-CF first and second version number determine the number of
+ * available counters in each counter set.
+ * Each counter set starts with header containing the counter set number and
+ * the number of eight byte counters.
+ *
+ * The functions returns the number of bytes occupied by this counter set
+ * including the header.
+ * If there is no counter in the counter set, this counter set is useless and
+ * zero is returned on this case.
+ *
+ * Note that the counter sets may not be enabled or active and the stcctm
+ * instruction might return error 3. Depending on error_ok value this is ok,
+ * for example when called from cpumf_pmu_start() call back function.
+ */
+static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
+ size_t room, bool error_ok)
+{
+ size_t ctrset_size, need = 0;
+ int rc = 3; /* Assume write failure */
+
+ ctrdata->def = CF_DIAG_CTRSET_DEF;
+ ctrdata->set = ctrset;
+ ctrdata->res1 = 0;
+ ctrset_size = cpum_cf_read_setsize(ctrset);
+
+ if (ctrset_size) { /* Save data */
+ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
+ if (need <= room) {
+ rc = ctr_stcctm(ctrset, ctrset_size,
+ (u64 *)(ctrdata + 1));
+ }
+ if (rc != 3 || error_ok)
+ ctrdata->ctr = ctrset_size;
+ else
+ need = 0;
+ }
+
+ return need;
+}
+
+static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = {
+ [CPUMF_CTR_SET_BASIC] = 0x02,
+ [CPUMF_CTR_SET_USER] = 0x04,
+ [CPUMF_CTR_SET_CRYPTO] = 0x08,
+ [CPUMF_CTR_SET_EXT] = 0x01,
+ [CPUMF_CTR_SET_MT_DIAG] = 0x20,
+};
+
+/* Read out all counter sets and save them in the provided data buffer.
+ * The last 64 byte host an artificial trailer entry.
+ */
+static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth,
+ bool error_ok)
+{
+ struct cf_trailer_entry *trailer;
+ size_t offset = 0, done;
+ int i;
+
+ memset(data, 0, sz);
+ sz -= sizeof(*trailer); /* Always room for trailer */
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ struct cf_ctrset_entry *ctrdata = data + offset;
+
+ if (!(auth & cpumf_ctr_ctl[i]))
+ continue; /* Counter set not authorized */
+
+ done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok);
+ offset += done;
+ }
+ trailer = data + offset;
+ cfdiag_trailer(trailer);
+ return offset + sizeof(*trailer);
+}
+
+/* Calculate the difference for each counter in a counter set. */
+static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters)
+{
+ for (; --counters >= 0; ++pstart, ++pstop)
+ if (*pstop >= *pstart)
+ *pstop -= *pstart;
+ else
+ *pstop = *pstart - *pstop + 1;
+}
+
+/* Scan the counter sets and calculate the difference of each counter
+ * in each set. The result is the increment of each counter during the
+ * period the counter set has been activated.
+ *
+ * Return true on success.
+ */
+static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
+{
+ struct cf_trailer_entry *trailer_start, *trailer_stop;
+ struct cf_ctrset_entry *ctrstart, *ctrstop;
+ size_t offset = 0;
+
+ auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
+ do {
+ ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
+ ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
+
+ if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
+ pr_err_once("cpum_cf_diag counter set compare error "
+ "in set %i\n", ctrstart->set);
+ return 0;
+ }
+ auth &= ~cpumf_ctr_ctl[ctrstart->set];
+ if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
+ cfdiag_diffctrset((u64 *)(ctrstart + 1),
+ (u64 *)(ctrstop + 1), ctrstart->ctr);
+ offset += ctrstart->ctr * sizeof(u64) +
+ sizeof(*ctrstart);
+ }
+ } while (ctrstart->def && auth);
+
+ /* Save time_stamp from start of event in stop's trailer */
+ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);
+ trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset);
+ trailer_stop->progusage[0] = trailer_start->timestamp;
+
+ return 1;
+}
static enum cpumf_ctr_set get_counter_set(u64 event)
{
@@ -34,39 +602,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event)
return set;
}
-static int validate_ctr_version(const struct hw_perf_event *hwc)
+static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set)
{
- struct cpu_cf_events *cpuhw;
- int err = 0;
u16 mtdiag_ctl;
-
- cpuhw = &get_cpu_var(cpu_cf_events);
+ int err = 0;
/* check required version for counter sets */
- switch (hwc->config_base) {
+ switch (set) {
case CPUMF_CTR_SET_BASIC:
case CPUMF_CTR_SET_USER:
- if (cpuhw->info.cfvn < 1)
+ if (cpumf_ctr_info.cfvn < 1)
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_CRYPTO:
- if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 &&
- hwc->config > 79) ||
- (cpuhw->info.csvn >= 6 && hwc->config > 83))
+ if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 &&
+ config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83))
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_EXT:
- if (cpuhw->info.csvn < 1)
+ if (cpumf_ctr_info.csvn < 1)
err = -EOPNOTSUPP;
- if ((cpuhw->info.csvn == 1 && hwc->config > 159) ||
- (cpuhw->info.csvn == 2 && hwc->config > 175) ||
- (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5
- && hwc->config > 255) ||
- (cpuhw->info.csvn >= 6 && hwc->config > 287))
+ if ((cpumf_ctr_info.csvn == 1 && config > 159) ||
+ (cpumf_ctr_info.csvn == 2 && config > 175) ||
+ (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 &&
+ config > 255) ||
+ (cpumf_ctr_info.csvn >= 6 && config > 287))
err = -EOPNOTSUPP;
break;
case CPUMF_CTR_SET_MT_DIAG:
- if (cpuhw->info.csvn <= 3)
+ if (cpumf_ctr_info.csvn <= 3)
err = -EOPNOTSUPP;
/*
* MT-diagnostic counters are read-only. The counter set
@@ -81,35 +645,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc)
* counter set is enabled and active.
*/
mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG];
- if (!((cpuhw->info.auth_ctl & mtdiag_ctl) &&
- (cpuhw->info.enable_ctl & mtdiag_ctl) &&
- (cpuhw->info.act_ctl & mtdiag_ctl)))
+ if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) &&
+ (cpumf_ctr_info.enable_ctl & mtdiag_ctl) &&
+ (cpumf_ctr_info.act_ctl & mtdiag_ctl)))
err = -EOPNOTSUPP;
break;
+ case CPUMF_CTR_SET_MAX:
+ err = -EOPNOTSUPP;
}
- put_cpu_var(cpu_cf_events);
- return err;
-}
-
-static int validate_ctr_auth(const struct hw_perf_event *hwc)
-{
- struct cpu_cf_events *cpuhw;
- u64 ctrs_state;
- int err = 0;
-
- cpuhw = &get_cpu_var(cpu_cf_events);
-
- /* Check authorization for cpu counter sets.
- * If the particular CPU counter set is not authorized,
- * return with -ENOENT in order to fall back to other
- * PMUs that might suffice the event request.
- */
- ctrs_state = cpumf_ctr_ctl[hwc->config_base];
- if (!(ctrs_state & cpuhw->info.auth_ctl))
- err = -ENOENT;
-
- put_cpu_var(cpu_cf_events);
return err;
}
@@ -120,20 +664,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc)
*/
static void cpumf_pmu_enable(struct pmu *pmu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
int err;
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- err = lcctl(cpuhw->state);
- if (err) {
- pr_err("Enabling the performance measuring unit "
- "failed with rc=%x\n", err);
+ if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED))
return;
- }
- cpuhw->flags |= PMU_F_ENABLED;
+ err = lcctl(cpuhw->state | cpuhw->dev_state);
+ if (err)
+ pr_err("Enabling the performance measuring unit failed with rc=%x\n", err);
+ else
+ cpuhw->flags |= PMU_F_ENABLED;
}
/*
@@ -143,39 +684,26 @@ static void cpumf_pmu_enable(struct pmu *pmu)
*/
static void cpumf_pmu_disable(struct pmu *pmu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err;
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
u64 inactive;
+ int err;
- if (!(cpuhw->flags & PMU_F_ENABLED))
+ if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED))
return;
inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
+ inactive |= cpuhw->dev_state;
err = lcctl(inactive);
- if (err) {
- pr_err("Disabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
-
- cpuhw->flags &= ~PMU_F_ENABLED;
+ if (err)
+ pr_err("Disabling the performance measuring unit failed with rc=%x\n", err);
+ else
+ cpuhw->flags &= ~PMU_F_ENABLED;
}
-
-/* Number of perf events counting hardware events */
-static atomic_t num_events = ATOMIC_INIT(0);
-/* Used to avoid races in calling reserve/release_cpumf_hardware */
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
/* Release the PMU if event is the last perf event */
static void hw_perf_event_destroy(struct perf_event *event)
{
- if (!atomic_add_unless(&num_events, -1, 1)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_dec_return(&num_events) == 0)
- __kernel_cpumcf_end();
- mutex_unlock(&pmc_reserve_mutex);
- }
+ cpum_cf_free(event->cpu);
}
/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */
@@ -199,12 +727,17 @@ static const int cpumf_generic_events_user[] = {
[PERF_COUNT_HW_BUS_CYCLES] = -1,
};
+static int is_userspace_event(u64 ev)
+{
+ return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev;
+}
+
static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
{
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
enum cpumf_ctr_set set;
- int err = 0;
u64 ev;
switch (type) {
@@ -221,21 +754,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
if (is_sampling_event(event)) /* No sampling support */
return -ENOENT;
ev = attr->config;
- /* Count user space (problem-state) only */
if (!attr->exclude_user && attr->exclude_kernel) {
- if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
- return -EOPNOTSUPP;
- ev = cpumf_generic_events_user[ev];
-
- /* No support for kernel space counters only */
+ /*
+ * Count user space (problem-state) only
+ * Handle events 32 and 33 as 0:u and 1:u
+ */
+ if (!is_userspace_event(ev)) {
+ if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
+ return -EOPNOTSUPP;
+ ev = cpumf_generic_events_user[ev];
+ }
} else if (!attr->exclude_kernel && attr->exclude_user) {
+ /* No support for kernel space counters only */
return -EOPNOTSUPP;
-
- /* Count user and kernel space */
} else {
- if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
- return -EOPNOTSUPP;
- ev = cpumf_generic_events_basic[ev];
+ /* Count user and kernel space, incl. events 32 + 33 */
+ if (!is_userspace_event(ev)) {
+ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
+ return -EOPNOTSUPP;
+ ev = cpumf_generic_events_basic[ev];
+ }
}
break;
@@ -260,36 +798,51 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
/*
* Use the hardware perf event structure to store the
* counter number in the 'config' member and the counter
- * set number in the 'config_base'. The counter set number
- * is then later used to enable/disable the counter(s).
+ * set number in the 'config_base' as bit mask.
+ * It is later used to enable/disable the counter(s).
*/
hwc->config = ev;
- hwc->config_base = set;
+ hwc->config_base = cpumf_ctr_ctl[set];
break;
case CPUMF_CTR_SET_MAX:
/* The counter could not be associated to a counter set */
return -EINVAL;
- };
+ }
/* Initialize for using the CPU-measurement counter facility */
- if (!atomic_inc_not_zero(&num_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin())
- err = -EBUSY;
- else
- atomic_inc(&num_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
+ if (cpum_cf_alloc(event->cpu))
+ return -ENOMEM;
event->destroy = hw_perf_event_destroy;
- /* Finally, validate version and authorization of the counter set */
- err = validate_ctr_auth(hwc);
- if (!err)
- err = validate_ctr_version(hwc);
+ /*
+ * Finally, validate version and authorization of the counter set.
+ * If the particular CPU counter set is not authorized,
+ * return with -ENOENT in order to fall back to other
+ * PMUs that might suffice the event request.
+ */
+ if (!(hwc->config_base & cpumf_ctr_info.auth_ctl))
+ return -ENOENT;
+ return validate_ctr_version(hwc->config, set);
+}
- return err;
+/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different
+ * attribute::type values:
+ * - PERF_TYPE_HARDWARE:
+ * - pmu->type:
+ * Handle both type of invocations identical. They address the same hardware.
+ * The result is different when event modifiers exclude_kernel and/or
+ * exclude_user are also set.
+ */
+static int cpumf_pmu_event_type(struct perf_event *event)
+{
+ u64 ev = event->attr.config;
+
+ if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
+ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev)
+ return PERF_TYPE_HARDWARE;
+ return PERF_TYPE_RAW;
}
static int cpumf_pmu_event_init(struct perf_event *event)
@@ -301,7 +854,7 @@ static int cpumf_pmu_event_init(struct perf_event *event)
err = __hw_perf_event_init(event, type);
else if (event->pmu->type == type)
/* Registered as unknown PMU */
- err = __hw_perf_event_init(event, PERF_TYPE_RAW);
+ err = __hw_perf_event_init(event, cpumf_pmu_event_type(event));
else
return -ENOENT;
@@ -361,18 +914,13 @@ static void cpumf_pmu_read(struct perf_event *event)
static void cpumf_pmu_start(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
struct hw_perf_event *hwc = &event->hw;
+ int i;
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- if (WARN_ON_ONCE(hwc->config == -1))
+ if (!(hwc->state & PERF_HES_STOPPED))
return;
- if (flags & PERF_EF_RELOAD)
- WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
-
hwc->state = 0;
/* (Re-)enable and activate the counter set */
@@ -384,45 +932,95 @@ static void cpumf_pmu_start(struct perf_event *event, int flags)
* needs to be synchronized. At this point, the counter set can be in
* the inactive or disabled state.
*/
- hw_perf_event_reset(event);
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ cpuhw->usedss = cfdiag_getctr(cpuhw->start,
+ sizeof(cpuhw->start),
+ hwc->config_base, true);
+ } else {
+ hw_perf_event_reset(event);
+ }
- /* increment refcount for this counter set */
- atomic_inc(&cpuhw->ctr_set[hwc->config_base]);
+ /* Increment refcount for counter sets */
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if ((hwc->config_base & cpumf_ctr_ctl[i]))
+ atomic_inc(&cpuhw->ctr_set[i]);
+}
+
+/* Create perf event sample with the counter sets as raw data. The sample
+ * is then pushed to the event subsystem and the function checks for
+ * possible event overflows. If an event overflow occurs, the PMU is
+ * stopped.
+ *
+ * Return non-zero if an event overflow occurred.
+ */
+static int cfdiag_push_sample(struct perf_event *event,
+ struct cpu_cf_events *cpuhw)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = event->cpu;
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = cpuhw->usedss;
+ raw.frag.data = cpuhw->stop;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ if (overflow)
+ event->pmu->stop(event, 0);
+
+ perf_event_update_userpage(event);
+ return overflow;
}
static void cpumf_pmu_stop(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
struct hw_perf_event *hwc = &event->hw;
+ int i;
if (!(hwc->state & PERF_HES_STOPPED)) {
/* Decrement reference count for this counter set and if this
* is the last used counter in the set, clear activation
* control and set the counter set state to inactive.
*/
- if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base]))
- ctr_set_stop(&cpuhw->state, hwc->config_base);
- event->hw.state |= PERF_HES_STOPPED;
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (!(hwc->config_base & cpumf_ctr_ctl[i]))
+ continue;
+ if (!atomic_dec_return(&cpuhw->ctr_set[i]))
+ ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]);
+ }
+ hwc->state |= PERF_HES_STOPPED;
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- hw_perf_event_update(event);
- event->hw.state |= PERF_HES_UPTODATE;
+ if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) {
+ local64_inc(&event->count);
+ cpuhw->usedss = cfdiag_getctr(cpuhw->stop,
+ sizeof(cpuhw->stop),
+ event->hw.config_base,
+ false);
+ if (cfdiag_diffctr(cpuhw, event->hw.config_base))
+ cfdiag_push_sample(event, cpuhw);
+ } else {
+ hw_perf_event_update(event);
+ }
+ hwc->state |= PERF_HES_UPTODATE;
}
}
static int cpumf_pmu_add(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- /* Check authorization for the counter set to which this
- * counter belongs.
- * For group events transaction, the authorization check is
- * done in cpumf_pmu_commit_txn().
- */
- if (!(cpuhw->txn_flags & PERF_PMU_TXN_ADD))
- if (validate_ctr_auth(&event->hw))
- return -ENOENT;
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
ctr_set_enable(&cpuhw->state, event->hw.config_base);
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
@@ -430,14 +1028,13 @@ static int cpumf_pmu_add(struct perf_event *event, int flags)
if (flags & PERF_EF_START)
cpumf_pmu_start(event, PERF_EF_RELOAD);
- perf_event_update_userpage(event);
-
return 0;
}
static void cpumf_pmu_del(struct perf_event *event, int flags)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ int i;
cpumf_pmu_stop(event, PERF_EF_UPDATE);
@@ -449,112 +1046,905 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
* clear enable control and resets all counters in a set. Therefore,
* cpumf_pmu_start() always has to reenable a counter set.
*/
- if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base]))
- ctr_set_disable(&cpuhw->state, event->hw.config_base);
-
- perf_event_update_userpage(event);
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
+ if (!atomic_read(&cpuhw->ctr_set[i]))
+ ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]);
}
+/* Performance monitoring unit for s390x */
+static struct pmu cpumf_pmu = {
+ .task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .pmu_enable = cpumf_pmu_enable,
+ .pmu_disable = cpumf_pmu_disable,
+ .event_init = cpumf_pmu_event_init,
+ .add = cpumf_pmu_add,
+ .del = cpumf_pmu_del,
+ .start = cpumf_pmu_start,
+ .stop = cpumf_pmu_stop,
+ .read = cpumf_pmu_read,
+};
+
+static struct cfset_session { /* CPUs and counter set bit mask */
+ struct list_head head; /* Head of list of active processes */
+} cfset_session = {
+ .head = LIST_HEAD_INIT(cfset_session.head)
+};
+
+static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */
/*
- * Start group events scheduling transaction.
- * Set flags to perform a single test at commit time.
+ * Synchronize access to device /dev/hwc. This mutex protects against
+ * concurrent access to functions cfset_open() and cfset_release().
+ * Same for CPU hotplug add and remove events triggering
+ * cpum_cf_online_cpu() and cpum_cf_offline_cpu().
+ * It also serializes concurrent device ioctl access from multiple
+ * processes accessing /dev/hwc.
*
- * We only support PERF_PMU_TXN_ADD transactions. Save the
- * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
- * transactions.
+ * The mutex protects concurrent access to the /dev/hwctr session management
+ * struct cfset_session and reference counting variable cfset_opencnt.
*/
-static void cpumf_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+static DEFINE_MUTEX(cfset_ctrset_mutex);
+
+/*
+ * CPU hotplug handles only /dev/hwctr device.
+ * For perf_event_open() the CPU hotplug handling is done on kernel common
+ * code:
+ * - CPU add: Nothing is done since a file descriptor can not be created
+ * and returned to the user.
+ * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and
+ * pmu_delete(). The event itself is removed when the file descriptor is
+ * closed.
+ */
+static int cfset_online_cpu(unsigned int cpu);
+
+static int cpum_cf_online_cpu(unsigned int cpu)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ int rc = 0;
+
+ /*
+ * Ignore notification for perf_event_open().
+ * Handle only /dev/hwctr device sessions.
+ */
+ mutex_lock(&cfset_ctrset_mutex);
+ if (refcount_read(&cfset_opencnt)) {
+ rc = cpum_cf_alloc_cpu(cpu);
+ if (!rc)
+ cfset_online_cpu(cpu);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return rc;
+}
+
+static int cfset_offline_cpu(unsigned int cpu);
+
+static int cpum_cf_offline_cpu(unsigned int cpu)
+{
+ /*
+ * During task exit processing of grouped perf events triggered by CPU
+ * hotplug processing, pmu_disable() is called as part of perf context
+ * removal process. Therefore do not trigger event removal now for
+ * perf_event_open() created events. Perf common code triggers event
+ * destruction when the event file descriptor is closed.
+ *
+ * Handle only /dev/hwctr device sessions.
+ */
+ mutex_lock(&cfset_ctrset_mutex);
+ if (refcount_read(&cfset_opencnt)) {
+ cfset_offline_cpu(cpu);
+ cpum_cf_free_cpu(cpu);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return 0;
+}
+
+/* Return true if store counter set multiple instruction is available */
+static inline int stccm_avail(void)
+{
+ return test_facility(142);
+}
+
+/* CPU-measurement alerts for the counter facility */
+static void cpumf_measurement_alert(struct ext_code ext_code,
+ unsigned int alert, unsigned long unused)
+{
+ struct cpu_cf_events *cpuhw;
+
+ if (!(alert & CPU_MF_INT_CF_MASK))
+ return;
- WARN_ON_ONCE(cpuhw->txn_flags); /* txn already in flight */
+ inc_irq_stat(IRQEXT_CMC);
- cpuhw->txn_flags = txn_flags;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
+ /*
+ * Measurement alerts are shared and might happen when the PMU
+ * is not reserved. Ignore these alerts in this case.
+ */
+ cpuhw = this_cpu_cfhw();
+ if (!cpuhw)
return;
- perf_pmu_disable(pmu);
- cpuhw->tx_state = cpuhw->state;
+ /* counter authorization change alert */
+ if (alert & CPU_MF_INT_CF_CACA)
+ qctri(&cpumf_ctr_info);
+
+ /* loss of counter data alert */
+ if (alert & CPU_MF_INT_CF_LCDA)
+ pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());
+
+ /* loss of MT counter data alert */
+ if (alert & CPU_MF_INT_CF_MTDA)
+ pr_warn("CPU[%i] MT counter data was lost\n",
+ smp_processor_id());
}
-/*
- * Stop and cancel a group events scheduling tranctions.
- * Assumes cpumf_pmu_del() is called for each successful added
- * cpumf_pmu_add() during the transaction.
+static int cfset_init(void);
+static int __init cpumf_pmu_init(void)
+{
+ int rc;
+
+ /* Extract counter measurement facility information */
+ if (!cpum_cf_avail() || qctri(&cpumf_ctr_info))
+ return -ENODEV;
+
+ /* Determine and store counter set sizes for later reference */
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ cpum_cf_make_setsize(rc);
+
+ /*
+ * Clear bit 15 of cr0 to unauthorize problem-state to
+ * extract measurement counters
+ */
+ system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT);
+
+ /* register handler for measurement-alert interruptions */
+ rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
+ cpumf_measurement_alert);
+ if (rc) {
+ pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc);
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
+ if (!cf_dbg) {
+ pr_err("Registration of s390dbf(cpum_cf) failed\n");
+ rc = -ENOMEM;
+ goto out1;
+ }
+ debug_register_view(cf_dbg, &debug_sprintf_view);
+
+ cpumf_pmu.attr_groups = cpumf_cf_event_group();
+ rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
+ if (rc) {
+ pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+ goto out2;
+ } else if (stccm_avail()) { /* Setup counter set device */
+ cfset_init();
+ }
+
+ rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
+ "perf/s390/cf:online",
+ cpum_cf_online_cpu, cpum_cf_offline_cpu);
+ return rc;
+
+out2:
+ debug_unregister_view(cf_dbg, &debug_sprintf_view);
+ debug_unregister(cf_dbg);
+out1:
+ unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert);
+ return rc;
+}
+
+/* Support for the CPU Measurement Facility counter set extraction using
+ * device /dev/hwctr. This allows user space programs to extract complete
+ * counter set via normal file operations.
+ */
+
+struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */
+ unsigned int sets; /* Counter set bit mask */
+ atomic_t cpus_ack; /* # CPUs successfully executed func */
+};
+
+struct cfset_request { /* CPUs and counter set bit mask */
+ unsigned long ctrset; /* Bit mask of counter set to read */
+ cpumask_t mask; /* CPU mask to read from */
+ struct list_head node; /* Chain to cfset_session.head */
+};
+
+static void cfset_session_init(void)
+{
+ INIT_LIST_HEAD(&cfset_session.head);
+}
+
+/* Remove current request from global bookkeeping. Maintain a counter set bit
+ * mask on a per CPU basis.
+ * Done in process context under mutex protection.
*/
-static void cpumf_pmu_cancel_txn(struct pmu *pmu)
+static void cfset_session_del(struct cfset_request *p)
{
- unsigned int txn_flags;
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
+ list_del(&p->node);
+}
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+/* Add current request to global bookkeeping. Maintain a counter set bit mask
+ * on a per CPU basis.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_add(struct cfset_request *p)
+{
+ list_add(&p->node, &cfset_session.head);
+}
- txn_flags = cpuhw->txn_flags;
- cpuhw->txn_flags = 0;
- if (txn_flags & ~PERF_PMU_TXN_ADD)
- return;
+/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access
+ * path is currently used.
+ * The cpu_cf_events::dev_state is used to denote counter sets in use by this
+ * interface. It is always or'ed in. If this interface is not active, its
+ * value is zero and no additional counter sets will be included.
+ *
+ * The cpu_cf_events::state is used by the perf_event_open SVC and remains
+ * unchanged.
+ *
+ * perf_pmu_enable() and perf_pmu_enable() and its call backs
+ * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
+ * performance measurement subsystem to enable per process
+ * CPU Measurement counter facility.
+ * The XXX_enable() and XXX_disable functions are used to turn off
+ * x86 performance monitoring interrupt (PMI) during scheduling.
+ * s390 uses these calls to temporarily stop and resume the active CPU
+ * counters sets during scheduling.
+ *
+ * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
+ * device access. The perf_event_open() SVC interface makes a lot of effort
+ * to only run the counters while the calling process is actively scheduled
+ * to run.
+ * When /dev/hwctr interface is also used at the same time, the counter sets
+ * will keep running, even when the process is scheduled off a CPU.
+ * However this is not a problem and does not lead to wrong counter values
+ * for the perf_event_open() SVC. The current counter value will be recorded
+ * during schedule-in. At schedule-out time the current counter value is
+ * extracted again and the delta is calculated and added to the event.
+ */
+/* Stop all counter sets via ioctl interface */
+static void cfset_ioctl_off(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
+
+ /* Check if any counter set used by /dev/hwctr */
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc])) {
+ if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
+ ctr_set_disable(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ ctr_set_stop(&cpuhw->dev_state,
+ cpumf_ctr_ctl[rc]);
+ }
+ }
+ /* Keep perf_event_open counter sets */
+ rc = lcctl(cpuhw->dev_state | cpuhw->state);
+ if (rc)
+ pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+ if (!cpuhw->dev_state)
+ cpuhw->flags &= ~PMU_F_IN_USE;
+}
+
+/* Start counter sets on particular CPU */
+static void cfset_ioctl_on(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int rc;
+
+ cpuhw->flags |= PMU_F_IN_USE;
+ ctr_set_enable(&cpuhw->dev_state, p->sets);
+ ctr_set_start(&cpuhw->dev_state, p->sets);
+ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
+ if ((p->sets & cpumf_ctr_ctl[rc]))
+ atomic_inc(&cpuhw->ctr_set[rc]);
+ rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */
+ if (!rc)
+ atomic_inc(&p->cpus_ack);
+ else
+ pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc);
+}
+
+static void cfset_release_cpu(void *p)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ int rc;
+
+ cpuhw->dev_state = 0;
+ rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */
+ if (rc)
+ pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
+ cpuhw->state, S390_HWCTR_DEVICE, rc);
+}
+
+/* This modifies the process CPU mask to adopt it to the currently online
+ * CPUs. Offline CPUs can not be addresses. This call terminates the access
+ * and is usually followed by close() or a new iotcl(..., START, ...) which
+ * creates a new request structure.
+ */
+static void cfset_all_stop(struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p = {
+ .sets = req->ctrset,
+ };
- WARN_ON(cpuhw->tx_state != cpuhw->state);
+ cpumask_and(&req->mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
+}
- perf_pmu_enable(pmu);
+/* Release function is also called when application gets terminated without
+ * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command.
+ */
+static int cfset_release(struct inode *inode, struct file *file)
+{
+ mutex_lock(&cfset_ctrset_mutex);
+ /* Open followed by close/exit has no private_data */
+ if (file->private_data) {
+ cfset_all_stop(file->private_data);
+ cfset_session_del(file->private_data);
+ kfree(file->private_data);
+ file->private_data = NULL;
+ }
+ if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */
+ on_each_cpu(cfset_release_cpu, NULL, 1);
+ cpum_cf_free(-1);
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ return 0;
}
/*
- * Commit the group events scheduling transaction. On success, the
- * transaction is closed. On error, the transaction is kept open
- * until cpumf_pmu_cancel_txn() is called.
+ * Open via /dev/hwctr device. Allocate all per CPU resources on the first
+ * open of the device. The last close releases all per CPU resources.
+ * Parallel perf_event_open system calls also use per CPU resources.
+ * These invocations are handled via reference counting on the per CPU data
+ * structures.
*/
-static int cpumf_pmu_commit_txn(struct pmu *pmu)
+static int cfset_open(struct inode *inode, struct file *file)
{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 state;
+ int rc = 0;
+
+ if (!perfmon_capable())
+ return -EPERM;
+ file->private_data = NULL;
+
+ mutex_lock(&cfset_ctrset_mutex);
+ if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */
+ rc = cpum_cf_alloc(-1);
+ if (!rc) {
+ cfset_session_init();
+ refcount_set(&cfset_opencnt, 1);
+ }
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
- WARN_ON_ONCE(!cpuhw->txn_flags); /* no txn in flight */
+ /* nonseekable_open() never fails */
+ return rc ?: nonseekable_open(inode, file);
+}
- if (cpuhw->txn_flags & ~PERF_PMU_TXN_ADD) {
- cpuhw->txn_flags = 0;
- return 0;
+static int cfset_all_start(struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p = {
+ .sets = req->ctrset,
+ .cpus_ack = ATOMIC_INIT(0),
+ };
+ cpumask_var_t mask;
+ int rc = 0;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_and(mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
+ if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
+ on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
+ rc = -EIO;
}
+ free_cpumask_var(mask);
+ return rc;
+}
- /* check if the updated state can be scheduled */
- state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- state >>= CPUMF_LCCTL_ENABLE_SHIFT;
- if ((state & cpuhw->info.auth_ctl) != state)
- return -ENOENT;
+/* Return the maximum required space for all possible CPUs in case one
+ * CPU will be onlined during the START, READ, STOP cycles.
+ * To find out the size of the counter sets, any one CPU will do. They
+ * all have the same counter sets.
+ */
+static size_t cfset_needspace(unsigned int sets)
+{
+ size_t bytes = 0;
+ int i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (!(sets & cpumf_ctr_ctl[i]))
+ continue;
+ bytes += cpum_cf_read_setsize(i) * sizeof(u64) +
+ sizeof(((struct s390_ctrset_setdata *)0)->set) +
+ sizeof(((struct s390_ctrset_setdata *)0)->no_cnts);
+ }
+ bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids *
+ (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) +
+ sizeof(((struct s390_ctrset_cpudata *)0)->no_sets));
+ return bytes;
+}
+
+static int cfset_all_copy(unsigned long arg, cpumask_t *mask)
+{
+ struct s390_ctrset_read __user *ctrset_read;
+ unsigned int cpu, cpus, rc = 0;
+ void __user *uptr;
+
+ ctrset_read = (struct s390_ctrset_read __user *)arg;
+ uptr = ctrset_read->data;
+ for_each_cpu(cpu, mask) {
+ struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu);
+ struct s390_ctrset_cpudata __user *ctrset_cpudata;
+
+ ctrset_cpudata = uptr;
+ rc = put_user(cpu, &ctrset_cpudata->cpu_nr);
+ rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets);
+ rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data,
+ cpuhw->used);
+ if (rc) {
+ rc = -EFAULT;
+ goto out;
+ }
+ uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used;
+ cond_resched();
+ }
+ cpus = cpumask_weight(mask);
+ if (put_user(cpus, &ctrset_read->no_cpus))
+ rc = -EFAULT;
+out:
+ return rc;
+}
- cpuhw->txn_flags = 0;
- perf_pmu_enable(pmu);
+static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset,
+ int ctrset_size, size_t room)
+{
+ size_t need = 0;
+ int rc = -1;
+
+ need = sizeof(*p) + sizeof(u64) * ctrset_size;
+ if (need <= room) {
+ p->set = cpumf_ctr_ctl[ctrset];
+ p->no_cnts = ctrset_size;
+ rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv);
+ if (rc == 3) /* Nothing stored */
+ need = 0;
+ }
+ return need;
+}
+
+/* Read all counter sets. */
+static void cfset_cpu_read(void *parm)
+{
+ struct cpu_cf_events *cpuhw = this_cpu_cfhw();
+ struct cfset_call_on_cpu_parm *p = parm;
+ int set, set_size;
+ size_t space;
+
+ /* No data saved yet */
+ cpuhw->used = 0;
+ cpuhw->sets = 0;
+ memset(cpuhw->data, 0, sizeof(cpuhw->data));
+
+ /* Scan the counter sets */
+ for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) {
+ struct s390_ctrset_setdata *sp = (void *)cpuhw->data +
+ cpuhw->used;
+
+ if (!(p->sets & cpumf_ctr_ctl[set]))
+ continue; /* Counter set not in list */
+ set_size = cpum_cf_read_setsize(set);
+ space = sizeof(cpuhw->data) - cpuhw->used;
+ space = cfset_cpuset_read(sp, set, set_size, space);
+ if (space) {
+ cpuhw->used += space;
+ cpuhw->sets += 1;
+ }
+ }
+}
+
+static int cfset_all_read(unsigned long arg, struct cfset_request *req)
+{
+ struct cfset_call_on_cpu_parm p;
+ cpumask_var_t mask;
+ int rc;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ p.sets = req->ctrset;
+ cpumask_and(mask, &req->mask, cpu_online_mask);
+ on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
+ rc = cfset_all_copy(arg, mask);
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
+{
+ int ret = -ENODATA;
+
+ if (req && req->ctrset)
+ ret = cfset_all_read(arg, req);
+ return ret;
+}
+
+static long cfset_ioctl_stop(struct file *file)
+{
+ struct cfset_request *req = file->private_data;
+ int ret = -ENXIO;
+
+ if (req) {
+ cfset_all_stop(req);
+ cfset_session_del(req);
+ kfree(req);
+ file->private_data = NULL;
+ ret = 0;
+ }
+ return ret;
+}
+
+static long cfset_ioctl_start(unsigned long arg, struct file *file)
+{
+ struct s390_ctrset_start __user *ustart;
+ struct s390_ctrset_start start;
+ struct cfset_request *preq;
+ void __user *umask;
+ unsigned int len;
+ int ret = 0;
+ size_t need;
+
+ if (file->private_data)
+ return -EBUSY;
+ ustart = (struct s390_ctrset_start __user *)arg;
+ if (copy_from_user(&start, ustart, sizeof(start)))
+ return -EFAULT;
+ if (start.version != S390_HWCTR_START_VERSION)
+ return -EINVAL;
+ if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_USER] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] |
+ cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]))
+ return -EINVAL; /* Invalid counter set */
+ if (!start.counter_sets)
+ return -EINVAL; /* No counter set at all? */
+
+ preq = kzalloc(sizeof(*preq), GFP_KERNEL);
+ if (!preq)
+ return -ENOMEM;
+ cpumask_clear(&preq->mask);
+ len = min_t(u64, start.cpumask_len, cpumask_size());
+ umask = (void __user *)start.cpumask;
+ if (copy_from_user(&preq->mask, umask, len)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ if (cpumask_empty(&preq->mask)) {
+ kfree(preq);
+ return -EINVAL;
+ }
+ need = cfset_needspace(start.counter_sets);
+ if (put_user(need, &ustart->data_bytes)) {
+ kfree(preq);
+ return -EFAULT;
+ }
+ preq->ctrset = start.counter_sets;
+ ret = cfset_all_start(preq);
+ if (!ret) {
+ cfset_session_add(preq);
+ file->private_data = preq;
+ } else {
+ kfree(preq);
+ }
+ return ret;
+}
+
+/* Entry point to the /dev/hwctr device interface.
+ * The ioctl system call supports three subcommands:
+ * S390_HWCTR_START: Start the specified counter sets on a CPU list. The
+ * counter set keeps running until explicitly stopped. Returns the number
+ * of bytes needed to store the counter values. If another S390_HWCTR_START
+ * ioctl subcommand is called without a previous S390_HWCTR_STOP stop
+ * command on the same file descriptor, -EBUSY is returned.
+ * S390_HWCTR_READ: Read the counter set values from specified CPU list given
+ * with the S390_HWCTR_START command.
+ * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the
+ * previous S390_HWCTR_START subcommand.
+ */
+static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ cpus_read_lock();
+ mutex_lock(&cfset_ctrset_mutex);
+ switch (cmd) {
+ case S390_HWCTR_START:
+ ret = cfset_ioctl_start(arg, file);
+ break;
+ case S390_HWCTR_STOP:
+ ret = cfset_ioctl_stop(file);
+ break;
+ case S390_HWCTR_READ:
+ ret = cfset_ioctl_read(arg, file->private_data);
+ break;
+ default:
+ ret = -ENOTTY;
+ break;
+ }
+ mutex_unlock(&cfset_ctrset_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+static const struct file_operations cfset_fops = {
+ .owner = THIS_MODULE,
+ .open = cfset_open,
+ .release = cfset_release,
+ .unlocked_ioctl = cfset_ioctl,
+ .compat_ioctl = cfset_ioctl,
+ .llseek = no_llseek
+};
+
+static struct miscdevice cfset_dev = {
+ .name = S390_HWCTR_DEVICE,
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &cfset_fops,
+ .mode = 0666,
+};
+
+/* Hotplug add of a CPU. Scan through all active processes and add
+ * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
+ */
+static int cfset_online_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ cfset_ioctl_on(&p);
+ cpumask_set_cpu(cpu, &rp->mask);
+ }
+ }
return 0;
}
-/* Performance monitoring unit for s390x */
-static struct pmu cpumf_pmu = {
+/* Hotplug remove of a CPU. Scan through all active processes and clear
+ * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
+ * Adjust reference counts.
+ */
+static int cfset_offline_cpu(unsigned int cpu)
+{
+ struct cfset_call_on_cpu_parm p;
+ struct cfset_request *rp;
+
+ if (!list_empty(&cfset_session.head)) {
+ list_for_each_entry(rp, &cfset_session.head, node) {
+ p.sets = rp->ctrset;
+ cfset_ioctl_off(&p);
+ cpumask_clear_cpu(cpu, &rp->mask);
+ }
+ }
+ return 0;
+}
+
+static void cfdiag_read(struct perf_event *event)
+{
+}
+
+static int get_authctrsets(void)
+{
+ unsigned long auth = 0;
+ enum cpumf_ctr_set i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i])
+ auth |= cpumf_ctr_ctl[i];
+ }
+ return auth;
+}
+
+/* Setup the event. Test for authorized counter sets and only include counter
+ * sets which are authorized at the time of the setup. Including unauthorized
+ * counter sets result in specification exception (and panic).
+ */
+static int cfdiag_event_init2(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = 0;
+
+ /* Set sample_period to indicate sampling */
+ event->hw.config = attr->config;
+ event->hw.sample_period = attr->sample_period;
+ local64_set(&event->hw.period_left, event->hw.sample_period);
+ local64_set(&event->count, 0);
+ event->hw.last_period = event->hw.sample_period;
+
+ /* Add all authorized counter sets to config_base. The
+ * the hardware init function is either called per-cpu or just once
+ * for all CPUS (event->cpu == -1). This depends on the whether
+ * counting is started for all CPUs or on a per workload base where
+ * the perf event moves from one CPU to another CPU.
+ * Checking the authorization on any CPU is fine as the hardware
+ * applies the same authorization settings to all CPUs.
+ */
+ event->hw.config_base = get_authctrsets();
+
+ /* No authorized counter sets, nothing to count/sample */
+ if (!event->hw.config_base)
+ err = -EINVAL;
+
+ return err;
+}
+
+static int cfdiag_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ int err = -ENOENT;
+
+ if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
+ event->attr.type != event->pmu->type)
+ goto out;
+
+ /* Raw events are used to access counters directly,
+ * hence do not permit excludes.
+ * This event is useless without PERF_SAMPLE_RAW to return counter set
+ * values as raw data.
+ */
+ if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
+ !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Initialize for using the CPU-measurement counter facility */
+ if (cpum_cf_alloc(event->cpu))
+ return -ENOMEM;
+ event->destroy = hw_perf_event_destroy;
+
+ err = cfdiag_event_init2(event);
+ if (unlikely(err))
+ event->destroy(event);
+out:
+ return err;
+}
+
+/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used
+ * to collect the complete counter sets for a scheduled process. Target
+ * are complete counter sets attached as raw data to the artificial event.
+ * This results in complete counter sets available when a process is
+ * scheduled. Contains the delta of every counter while the process was
+ * running.
+ */
+CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
+
+static struct attribute *cfdiag_events_attr[] = {
+ CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
+ NULL,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *cfdiag_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group cfdiag_events_group = {
+ .name = "events",
+ .attrs = cfdiag_events_attr,
+};
+static struct attribute_group cfdiag_format_group = {
+ .name = "format",
+ .attrs = cfdiag_format_attr,
+};
+static const struct attribute_group *cfdiag_attr_groups[] = {
+ &cfdiag_events_group,
+ &cfdiag_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for event CF_DIAG. Since this event
+ * is also started and stopped via the perf_event_open() system call, use
+ * the same event enable/disable call back functions. They do not
+ * have a pointer to the perf_event strcture as first parameter.
+ *
+ * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
+ * Reuse them and distinguish the event (always first parameter) via
+ * 'config' member.
+ */
+static struct pmu cf_diag = {
.task_ctx_nr = perf_sw_context,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .event_init = cfdiag_event_init,
.pmu_enable = cpumf_pmu_enable,
.pmu_disable = cpumf_pmu_disable,
- .event_init = cpumf_pmu_event_init,
.add = cpumf_pmu_add,
.del = cpumf_pmu_del,
.start = cpumf_pmu_start,
.stop = cpumf_pmu_stop,
- .read = cpumf_pmu_read,
- .start_txn = cpumf_pmu_start_txn,
- .commit_txn = cpumf_pmu_commit_txn,
- .cancel_txn = cpumf_pmu_cancel_txn,
+ .read = cfdiag_read,
+
+ .attr_groups = cfdiag_attr_groups
};
-static int __init cpumf_pmu_init(void)
+/* Calculate memory needed to store all counter sets together with header and
+ * trailer data. This is independent of the counter set authorization which
+ * can vary depending on the configuration.
+ */
+static size_t cfdiag_maxsize(struct cpumf_ctr_info *info)
+{
+ size_t max_size = sizeof(struct cf_trailer_entry);
+ enum cpumf_ctr_set i;
+
+ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
+ size_t size = cpum_cf_read_setsize(i);
+
+ if (size)
+ max_size += size * sizeof(u64) +
+ sizeof(struct cf_ctrset_entry);
+ }
+ return max_size;
+}
+
+/* Get the CPU speed, try sampling facility first and CPU attributes second. */
+static void cfdiag_get_cpu_speed(void)
+{
+ unsigned long mhz;
+
+ if (cpum_sf_avail()) { /* Sampling facility first */
+ struct hws_qsi_info_block si;
+
+ memset(&si, 0, sizeof(si));
+ if (!qsi(&si)) {
+ cfdiag_cpu_speed = si.cpu_speed;
+ return;
+ }
+ }
+
+ /* Fallback: CPU speed extract static part. Used in case
+ * CPU Measurement Sampling Facility is turned off.
+ */
+ mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+ if (mhz != -1UL)
+ cfdiag_cpu_speed = mhz & 0xffffffff;
+}
+
+static int cfset_init(void)
{
+ size_t need;
int rc;
- if (!kernel_cpumcf_avail())
- return -ENODEV;
+ cfdiag_get_cpu_speed();
+ /* Make sure the counter set data fits into predefined buffer. */
+ need = cfdiag_maxsize(&cpumf_ctr_info);
+ if (need > sizeof(((struct cpu_cf_events *)0)->start)) {
+ pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
+ need);
+ return -ENOMEM;
+ }
- cpumf_pmu.attr_groups = cpumf_cf_event_group();
- rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1);
- if (rc)
- pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
+ rc = misc_register(&cfset_dev);
+ if (rc) {
+ pr_err("Registration of /dev/%s failed rc=%i\n",
+ cfset_dev.name, rc);
+ goto out;
+ }
+
+ rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
+ if (rc) {
+ misc_deregister(&cfset_dev);
+ pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
+ rc);
+ }
+out:
return rc;
}
-subsys_initcall(cpumf_pmu_init);
+
+device_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
deleted file mode 100644
index 3bced89caffb..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ /dev/null
@@ -1,201 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * CPU-Measurement Counter Facility Support - Common Layer
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- */
-#define KMSG_COMPONENT "cpum_cf_common"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-
-/* Per-CPU event structure for the counter facility */
-DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = {
- .ctr_set = {
- [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0),
- [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0),
- },
- .alert = ATOMIC64_INIT(0),
- .state = 0,
- .flags = 0,
- .txn_flags = 0,
-};
-/* Indicator whether the CPU-Measurement Counter Facility Support is ready */
-static bool cpum_cf_initalized;
-
-/* CPU-measurement alerts for the counter facility */
-static void cpumf_measurement_alert(struct ext_code ext_code,
- unsigned int alert, unsigned long unused)
-{
- struct cpu_cf_events *cpuhw;
-
- if (!(alert & CPU_MF_INT_CF_MASK))
- return;
-
- inc_irq_stat(IRQEXT_CMC);
- cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- /* Measurement alerts are shared and might happen when the PMU
- * is not reserved. Ignore these alerts in this case. */
- if (!(cpuhw->flags & PMU_F_RESERVED))
- return;
-
- /* counter authorization change alert */
- if (alert & CPU_MF_INT_CF_CACA)
- qctri(&cpuhw->info);
-
- /* loss of counter data alert */
- if (alert & CPU_MF_INT_CF_LCDA)
- pr_err("CPU[%i] Counter data was lost\n", smp_processor_id());
-
- /* loss of MT counter data alert */
- if (alert & CPU_MF_INT_CF_MTDA)
- pr_warn("CPU[%i] MT counter data was lost\n",
- smp_processor_id());
-
- /* store alert for special handling by in-kernel users */
- atomic64_or(alert, &cpuhw->alert);
-}
-
-#define PMC_INIT 0
-#define PMC_RELEASE 1
-static void cpum_cf_setup_cpu(void *flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- switch (*((int *) flags)) {
- case PMC_INIT:
- memset(&cpuhw->info, 0, sizeof(cpuhw->info));
- qctri(&cpuhw->info);
- cpuhw->flags |= PMU_F_RESERVED;
- break;
-
- case PMC_RELEASE:
- cpuhw->flags &= ~PMU_F_RESERVED;
- break;
- }
-
- /* Disable CPU counter sets */
- lcctl(0);
-}
-
-bool kernel_cpumcf_avail(void)
-{
- return cpum_cf_initalized;
-}
-EXPORT_SYMBOL(kernel_cpumcf_avail);
-
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(cpumcf_owner_lock);
-static void *cpumcf_owner;
-
-/* Initialize the CPU-measurement counter facility */
-int __kernel_cpumcf_begin(void)
-{
- int flags = PMC_INIT;
- int err = 0;
-
- spin_lock(&cpumcf_owner_lock);
- if (cpumcf_owner)
- err = -EBUSY;
- else
- cpumcf_owner = __builtin_return_address(0);
- spin_unlock(&cpumcf_owner_lock);
- if (err)
- return err;
-
- on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
- irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- return 0;
-}
-EXPORT_SYMBOL(__kernel_cpumcf_begin);
-
-/* Obtain the CPU-measurement alerts for the counter facility */
-unsigned long kernel_cpumcf_alert(int clear)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- unsigned long alert;
-
- alert = atomic64_read(&cpuhw->alert);
- if (clear)
- atomic64_set(&cpuhw->alert, 0);
-
- return alert;
-}
-EXPORT_SYMBOL(kernel_cpumcf_alert);
-
-/* Release the CPU-measurement counter facility */
-void __kernel_cpumcf_end(void)
-{
- int flags = PMC_RELEASE;
-
- on_each_cpu(cpum_cf_setup_cpu, &flags, 1);
- irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-
- spin_lock(&cpumcf_owner_lock);
- cpumcf_owner = NULL;
- spin_unlock(&cpumcf_owner_lock);
-}
-EXPORT_SYMBOL(__kernel_cpumcf_end);
-
-static int cpum_cf_setup(unsigned int cpu, int flags)
-{
- local_irq_disable();
- cpum_cf_setup_cpu(&flags);
- local_irq_enable();
- return 0;
-}
-
-static int cpum_cf_online_cpu(unsigned int cpu)
-{
- return cpum_cf_setup(cpu, PMC_INIT);
-}
-
-static int cpum_cf_offline_cpu(unsigned int cpu)
-{
- return cpum_cf_setup(cpu, PMC_RELEASE);
-}
-
-static int __init cpum_cf_init(void)
-{
- int rc;
-
- if (!cpum_cf_avail())
- return -ENODEV;
-
- /* clear bit 15 of cr0 to unauthorize problem-state to
- * extract measurement counters */
- ctl_clear_bit(0, 48);
-
- /* register handler for measurement-alert interruptions */
- rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
- cpumf_measurement_alert);
- if (rc) {
- pr_err("Registering for CPU-measurement alerts "
- "failed with rc=%i\n", rc);
- return rc;
- }
-
- rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
- "perf/s390/cf:online",
- cpum_cf_online_cpu, cpum_cf_offline_cpu);
- if (!rc)
- cpum_cf_initalized = true;
-
- return rc;
-}
-early_initcall(cpum_cf_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
deleted file mode 100644
index e949ab832ed7..000000000000
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Performance event support for s390x - CPU-measurement Counter Sets
- *
- * Copyright IBM Corp. 2019
- * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
- * Thomas Richer <tmricht@linux.ibm.com>
- */
-#define KMSG_COMPONENT "cpum_cf_diag"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/processor.h>
-
-#include <asm/ctl_reg.h>
-#include <asm/irq.h>
-#include <asm/cpu_mcf.h>
-#include <asm/timex.h>
-#include <asm/debug.h>
-
-#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */
-
-static unsigned int cf_diag_cpu_speed;
-static debug_info_t *cf_diag_dbg;
-
-struct cf_diag_csd { /* Counter set data per CPU */
- size_t used; /* Bytes used in data/start */
- unsigned char start[PAGE_SIZE]; /* Counter set at event start */
- unsigned char data[PAGE_SIZE]; /* Counter set at event delete */
-};
-static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd);
-
-/* Counter sets are stored as data stream in a page sized memory buffer and
- * exported to user space via raw data attached to the event sample data.
- * Each counter set starts with an eight byte header consisting of:
- * - a two byte eye catcher (0xfeef)
- * - a one byte counter set number
- * - a two byte counter set size (indicates the number of counters in this set)
- * - a three byte reserved value (must be zero) to make the header the same
- * size as a counter value.
- * All counter values are eight byte in size.
- *
- * All counter sets are followed by a 64 byte trailer.
- * The trailer consists of a:
- * - flag field indicating valid fields when corresponding bit set
- * - the counter facility first and second version number
- * - the CPU speed if nonzero
- * - the time stamp the counter sets have been collected
- * - the time of day (TOD) base value
- * - the machine type.
- *
- * The counter sets are saved when the process is prepared to be executed on a
- * CPU and saved again when the process is going to be removed from a CPU.
- * The difference of both counter sets are calculated and stored in the event
- * sample data area.
- */
-
-struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */
- unsigned int def:16; /* 0-15 Data Entry Format */
- unsigned int set:16; /* 16-31 Counter set identifier */
- unsigned int ctr:16; /* 32-47 Number of stored counters */
- unsigned int res1:16; /* 48-63 Reserved */
-};
-
-struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */
- /* 0 - 7 */
- union {
- struct {
- unsigned int clock_base:1; /* TOD clock base set */
- unsigned int speed:1; /* CPU speed set */
- /* Measurement alerts */
- unsigned int mtda:1; /* Loss of MT ctr. data alert */
- unsigned int caca:1; /* Counter auth. change alert */
- unsigned int lcda:1; /* Loss of counter data alert */
- };
- unsigned long flags; /* 0-63 All indicators */
- };
- /* 8 - 15 */
- unsigned int cfvn:16; /* 64-79 Ctr First Version */
- unsigned int csvn:16; /* 80-95 Ctr Second Version */
- unsigned int cpu_speed:32; /* 96-127 CPU speed */
- /* 16 - 23 */
- unsigned long timestamp; /* 128-191 Timestamp (TOD) */
- /* 24 - 55 */
- union {
- struct {
- unsigned long progusage1;
- unsigned long progusage2;
- unsigned long progusage3;
- unsigned long tod_base;
- };
- unsigned long progusage[4];
- };
- /* 56 - 63 */
- unsigned int mach_type:16; /* Machine type */
- unsigned int res1:16; /* Reserved */
- unsigned int res2:32; /* Reserved */
-};
-
-/* Create the trailer data at the end of a page. */
-static void cf_diag_trailer(struct cf_trailer_entry *te)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cpuid cpuid;
-
- te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */
- te->csvn = cpuhw->info.csvn;
-
- get_cpu_id(&cpuid); /* Machine type */
- te->mach_type = cpuid.machine;
- te->cpu_speed = cf_diag_cpu_speed;
- if (te->cpu_speed)
- te->speed = 1;
- te->clock_base = 1; /* Save clock base */
- memcpy(&te->tod_base, &tod_clock_base[1], 8);
- store_tod_clock((__u64 *)&te->timestamp);
-}
-
-/*
- * Change the CPUMF state to active.
- * Enable and activate the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_enable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- err = lcctl(cpuhw->state);
- if (err) {
- pr_err("Enabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags |= PMU_F_ENABLED;
-}
-
-/*
- * Change the CPUMF state to inactive.
- * Disable and enable (inactive) the CPU-counter sets according
- * to the per-cpu control state.
- */
-static void cf_diag_disable(struct pmu *pmu)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- u64 inactive;
- int err;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s pmu %p cpu %d flags %#x state %#llx\n",
- __func__, pmu, smp_processor_id(), cpuhw->flags,
- cpuhw->state);
- if (!(cpuhw->flags & PMU_F_ENABLED))
- return;
-
- inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1);
- err = lcctl(inactive);
- if (err) {
- pr_err("Disabling the performance measuring unit "
- "failed with rc=%x\n", err);
- return;
- }
- cpuhw->flags &= ~PMU_F_ENABLED;
-}
-
-/* Number of perf events counting hardware events */
-static atomic_t cf_diag_events = ATOMIC_INIT(0);
-
-/* Release the PMU if event is the last perf event */
-static void cf_diag_perf_event_destroy(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d cf_diag_events %d\n",
- __func__, event, event->cpu,
- atomic_read(&cf_diag_events));
- if (atomic_dec_return(&cf_diag_events) == 0)
- __kernel_cpumcf_end();
-}
-
-/* Setup the event. Test for authorized counter sets and only include counter
- * sets which are authorized at the time of the setup. Including unauthorized
- * counter sets result in specification exception (and panic).
- */
-static int __hw_perf_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct cpu_cf_events *cpuhw;
- enum cpumf_ctr_set i;
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__,
- event, event->cpu);
-
- event->hw.config = attr->config;
- event->hw.config_base = 0;
-
- /* Add all authorized counter sets to config_base. The
- * the hardware init function is either called per-cpu or just once
- * for all CPUS (event->cpu == -1). This depends on the whether
- * counting is started for all CPUs or on a per workload base where
- * the perf event moves from one CPU to another CPU.
- * Checking the authorization on any CPU is fine as the hardware
- * applies the same authorization settings to all CPUs.
- */
- cpuhw = &get_cpu_var(cpu_cf_events);
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
- if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i])
- event->hw.config_base |= cpumf_ctr_ctl[i];
- put_cpu_var(cpu_cf_events);
-
- /* No authorized counter sets, nothing to count/sample */
- if (!event->hw.config_base) {
- err = -EINVAL;
- goto out;
- }
-
- /* Set sample_period to indicate sampling */
- event->hw.sample_period = attr->sample_period;
- local64_set(&event->hw.period_left, event->hw.sample_period);
- event->hw.last_period = event->hw.sample_period;
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n",
- __func__, err, event->hw.config_base);
- return err;
-}
-
-static int cf_diag_event_init(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- int err = -ENOENT;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d config %#llx type:%u "
- "sample_type %#llx cf_diag_events %d\n", __func__,
- event, event->cpu, attr->config, event->pmu->type,
- attr->sample_type, atomic_read(&cf_diag_events));
-
- if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG ||
- event->attr.type != event->pmu->type)
- goto out;
-
- /* Raw events are used to access counters directly,
- * hence do not permit excludes.
- * This event is usesless without PERF_SAMPLE_RAW to return counter set
- * values as raw data.
- */
- if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv ||
- !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) {
- err = -EOPNOTSUPP;
- goto out;
- }
-
- /* Initialize for using the CPU-measurement counter facility */
- if (atomic_inc_return(&cf_diag_events) == 1) {
- if (__kernel_cpumcf_begin()) {
- atomic_dec(&cf_diag_events);
- err = -EBUSY;
- goto out;
- }
- }
- event->destroy = cf_diag_perf_event_destroy;
-
- err = __hw_perf_event_init(event);
- if (unlikely(err))
- event->destroy(event);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_read(struct perf_event *event)
-{
- debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event);
-}
-
-/* Return the maximum possible counter set size (in number of 8 byte counters)
- * depending on type and model number.
- */
-static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset,
- struct cpumf_ctr_info *info)
-{
- size_t ctrset_size = 0;
-
- switch (ctrset) {
- case CPUMF_CTR_SET_BASIC:
- if (info->cfvn >= 1)
- ctrset_size = 6;
- break;
- case CPUMF_CTR_SET_USER:
- if (info->cfvn == 1)
- ctrset_size = 6;
- else if (info->cfvn >= 3)
- ctrset_size = 2;
- break;
- case CPUMF_CTR_SET_CRYPTO:
- if (info->csvn >= 1 && info->csvn <= 5)
- ctrset_size = 16;
- else if (info->csvn == 6)
- ctrset_size = 20;
- break;
- case CPUMF_CTR_SET_EXT:
- if (info->csvn == 1)
- ctrset_size = 32;
- else if (info->csvn == 2)
- ctrset_size = 48;
- else if (info->csvn >= 3 && info->csvn <= 5)
- ctrset_size = 128;
- else if (info->csvn == 6)
- ctrset_size = 160;
- break;
- case CPUMF_CTR_SET_MT_DIAG:
- if (info->csvn > 3)
- ctrset_size = 48;
- break;
- case CPUMF_CTR_SET_MAX:
- break;
- }
-
- return ctrset_size;
-}
-
-/* Calculate memory needed to store all counter sets together with header and
- * trailer data. This is independend of the counter set authorization which
- * can vary depending on the configuration.
- */
-static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info)
-{
- size_t max_size = sizeof(struct cf_trailer_entry);
- enum cpumf_ctr_set i;
-
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- size_t size = cf_diag_ctrset_size(i, info);
-
- if (size)
- max_size += size * sizeof(u64) +
- sizeof(struct cf_ctrset_entry);
- }
- debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__,
- max_size);
-
- return max_size;
-}
-
-/* Read a counter set. The counter set number determines which counter set and
- * the CPUM-CF first and second version number determine the number of
- * available counters in this counter set.
- * Each counter set starts with header containing the counter set number and
- * the number of 8 byte counters.
- *
- * The functions returns the number of bytes occupied by this counter set
- * including the header.
- * If there is no counter in the counter set, this counter set is useless and
- * zero is returned on this case.
- */
-static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset,
- size_t room)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- size_t ctrset_size, need = 0;
- int rc = 3; /* Assume write failure */
-
- ctrdata->def = CF_DIAG_CTRSET_DEF;
- ctrdata->set = ctrset;
- ctrdata->res1 = 0;
- ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info);
-
- if (ctrset_size) { /* Save data */
- need = ctrset_size * sizeof(u64) + sizeof(*ctrdata);
- if (need <= room)
- rc = ctr_stcctm(ctrset, ctrset_size,
- (u64 *)(ctrdata + 1));
- if (rc != 3)
- ctrdata->ctr = ctrset_size;
- else
- need = 0;
- }
-
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d ctrset_size %zu cfvn %d csvn %d"
- " need %zd rc %d\n",
- __func__, ctrset, ctrset_size, cpuhw->info.cfvn,
- cpuhw->info.csvn, need, rc);
- return need;
-}
-
-/* Read out all counter sets and save them in the provided data buffer.
- * The last 64 byte host an artificial trailer entry.
- */
-static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth)
-{
- struct cf_trailer_entry *trailer;
- size_t offset = 0, done;
- int i;
-
- memset(data, 0, sz);
- sz -= sizeof(*trailer); /* Always room for trailer */
- for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
- struct cf_ctrset_entry *ctrdata = data + offset;
-
- if (!(auth & cpumf_ctr_ctl[i]))
- continue; /* Counter set not authorized */
-
- done = cf_diag_getctrset(ctrdata, i, sz - offset);
- offset += done;
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s ctrset %d offset %zu done %zu\n",
- __func__, i, offset, done);
- }
- trailer = data + offset;
- cf_diag_trailer(trailer);
- return offset + sizeof(*trailer);
-}
-
-/* Calculate the difference for each counter in a counter set. */
-static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters)
-{
- for (; --counters >= 0; ++pstart, ++pstop)
- if (*pstop >= *pstart)
- *pstop -= *pstart;
- else
- *pstop = *pstart - *pstop;
-}
-
-/* Scan the counter sets and calculate the difference of each counter
- * in each set. The result is the increment of each counter during the
- * period the counter set has been activated.
- *
- * Return true on success.
- */
-static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth)
-{
- struct cf_trailer_entry *trailer_start, *trailer_stop;
- struct cf_ctrset_entry *ctrstart, *ctrstop;
- size_t offset = 0;
-
- auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
- do {
- ctrstart = (struct cf_ctrset_entry *)(csd->start + offset);
- ctrstop = (struct cf_ctrset_entry *)(csd->data + offset);
-
- if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
- pr_err("cpum_cf_diag counter set compare error "
- "in set %i\n", ctrstart->set);
- return 0;
- }
- auth &= ~cpumf_ctr_ctl[ctrstart->set];
- if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
- cf_diag_diffctrset((u64 *)(ctrstart + 1),
- (u64 *)(ctrstop + 1), ctrstart->ctr);
- offset += ctrstart->ctr * sizeof(u64) +
- sizeof(*ctrstart);
- }
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s set %d ctr %d offset %zu auth %lx\n",
- __func__, ctrstart->set, ctrstart->ctr,
- offset, auth);
- } while (ctrstart->def && auth);
-
- /* Save time_stamp from start of event in stop's trailer */
- trailer_start = (struct cf_trailer_entry *)(csd->start + offset);
- trailer_stop = (struct cf_trailer_entry *)(csd->data + offset);
- trailer_stop->progusage[0] = trailer_start->timestamp;
-
- return 1;
-}
-
-/* Create perf event sample with the counter sets as raw data. The sample
- * is then pushed to the event subsystem and the function checks for
- * possible event overflows. If an event overflow occurs, the PMU is
- * stopped.
- *
- * Return non-zero if an event overflow occurred.
- */
-static int cf_diag_push_sample(struct perf_event *event,
- struct cf_diag_csd *csd)
-{
- struct perf_sample_data data;
- struct perf_raw_record raw;
- struct pt_regs regs;
- int overflow;
-
- /* Setup perf sample */
- perf_sample_data_init(&data, 0, event->hw.last_period);
- memset(&regs, 0, sizeof(regs));
- memset(&raw, 0, sizeof(raw));
-
- if (event->attr.sample_type & PERF_SAMPLE_CPU)
- data.cpu_entry.cpu = event->cpu;
- if (event->attr.sample_type & PERF_SAMPLE_RAW) {
- raw.frag.size = csd->used;
- raw.frag.data = csd->data;
- raw.size = csd->used;
- data.raw = &raw;
- }
-
- overflow = perf_event_overflow(event, &data, &regs);
- debug_sprintf_event(cf_diag_dbg, 6,
- "%s event %p cpu %d sample_type %#llx raw %d "
- "ov %d\n", __func__, event, event->cpu,
- event->attr.sample_type, raw.size, overflow);
- if (overflow)
- event->pmu->stop(event, 0);
-
- perf_event_update_userpage(event);
- return overflow;
-}
-
-static void cf_diag_start(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- /* (Re-)enable and activate all counter sets */
- lcctl(0); /* Reset counter sets */
- hwc->state = 0;
- ctr_set_multiple_enable(&cpuhw->state, hwc->config_base);
- lcctl(cpuhw->state); /* Enable counter sets */
- csd->used = cf_diag_getctr(csd->start, sizeof(csd->start),
- event->hw.config_base);
- ctr_set_multiple_start(&cpuhw->state, hwc->config_base);
- /* Function cf_diag_enable() starts the counter sets. */
-}
-
-static void cf_diag_stop(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd);
- struct hw_perf_event *hwc = &event->hw;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x hwc-state %#x\n",
- __func__, event, event->cpu, flags, hwc->state);
-
- /* Deactivate all counter sets */
- ctr_set_multiple_stop(&cpuhw->state, hwc->config_base);
- local64_inc(&event->count);
- csd->used = cf_diag_getctr(csd->data, sizeof(csd->data),
- event->hw.config_base);
- if (cf_diag_diffctr(csd, event->hw.config_base))
- cf_diag_push_sample(event, csd);
- hwc->state |= PERF_HES_STOPPED;
-}
-
-static int cf_diag_add(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
- int err = 0;
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x cpuhw %p\n",
- __func__, event, event->cpu, flags, cpuhw);
-
- if (cpuhw->flags & PMU_F_IN_USE) {
- err = -EAGAIN;
- goto out;
- }
-
- event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- cpuhw->flags |= PMU_F_IN_USE;
- if (flags & PERF_EF_START)
- cf_diag_start(event, PERF_EF_RELOAD);
-out:
- debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err);
- return err;
-}
-
-static void cf_diag_del(struct perf_event *event, int flags)
-{
- struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
-
- debug_sprintf_event(cf_diag_dbg, 5,
- "%s event %p cpu %d flags %#x\n",
- __func__, event, event->cpu, flags);
-
- cf_diag_stop(event, PERF_EF_UPDATE);
- ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base);
- ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base);
- cpuhw->flags &= ~PMU_F_IN_USE;
-}
-
-CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG);
-
-static struct attribute *cf_diag_events_attr[] = {
- CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG),
- NULL,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-63");
-
-static struct attribute *cf_diag_format_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group cf_diag_events_group = {
- .name = "events",
- .attrs = cf_diag_events_attr,
-};
-static struct attribute_group cf_diag_format_group = {
- .name = "format",
- .attrs = cf_diag_format_attr,
-};
-static const struct attribute_group *cf_diag_attr_groups[] = {
- &cf_diag_events_group,
- &cf_diag_format_group,
- NULL,
-};
-
-/* Performance monitoring unit for s390x */
-static struct pmu cf_diag = {
- .task_ctx_nr = perf_sw_context,
- .pmu_enable = cf_diag_enable,
- .pmu_disable = cf_diag_disable,
- .event_init = cf_diag_event_init,
- .add = cf_diag_add,
- .del = cf_diag_del,
- .start = cf_diag_start,
- .stop = cf_diag_stop,
- .read = cf_diag_read,
-
- .attr_groups = cf_diag_attr_groups
-};
-
-/* Get the CPU speed, try sampling facility first and CPU attributes second. */
-static void cf_diag_get_cpu_speed(void)
-{
- if (cpum_sf_avail()) { /* Sampling facility first */
- struct hws_qsi_info_block si;
-
- memset(&si, 0, sizeof(si));
- if (!qsi(&si)) {
- cf_diag_cpu_speed = si.cpu_speed;
- return;
- }
- }
-
- if (test_facility(34)) { /* CPU speed extract static part */
- unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
-
- if (mhz != -1UL)
- cf_diag_cpu_speed = mhz & 0xffffffff;
- }
-}
-
-/* Initialize the counter set PMU to generate complete counter set data as
- * event raw data. This relies on the CPU Measurement Counter Facility device
- * already being loaded and initialized.
- */
-static int __init cf_diag_init(void)
-{
- struct cpumf_ctr_info info;
- size_t need;
- int rc;
-
- if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info))
- return -ENODEV;
- cf_diag_get_cpu_speed();
-
- /* Make sure the counter set data fits into predefined buffer. */
- need = cf_diag_ctrset_maxsize(&info);
- if (need > sizeof(((struct cf_diag_csd *)0)->start)) {
- pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n",
- need);
- return -ENOMEM;
- }
-
- /* Setup s390dbf facility */
- cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128);
- if (!cf_diag_dbg) {
- pr_err("Registration of s390dbf(cpum_cf_diag) failed\n");
- return -ENOMEM;
- }
- debug_register_view(cf_diag_dbg, &debug_sprintf_view);
-
- rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1);
- if (rc) {
- debug_unregister_view(cf_diag_dbg, &debug_sprintf_view);
- debug_unregister(cf_diag_dbg);
- pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n",
- rc);
- }
- return rc;
-}
-arch_initcall(cf_diag_init);
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 8b33e03e47b8..0d64aafd158f 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -238,6 +238,134 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_HPAGE_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z15, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z15, L1D_L2D_SOURCED_WRITES, 0x0085);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z15, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z15, L1I_L2I_SOURCED_WRITES, 0x0088);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z15, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z15, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES, 0x0090);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES, 0x0091);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES, 0x0093);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x0094);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x0095);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES, 0x0096);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x0097);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x0098);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES, 0x0099);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x009a);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x009b);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES, 0x009c);
+CPUMF_EVENT_ATTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES, 0x009d);
+CPUMF_EVENT_ATTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO, 0x009e);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af);
+CPUMF_EVENT_ATTR(cf_z15, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z15, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z15, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z15, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z15, TX_NC_TABORT, 0x00f3);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108);
+CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082);
+CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083);
+CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086);
+CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a);
+CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d);
+CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093);
+CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d);
+CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e);
+CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7);
+CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab);
+CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5);
+CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6);
+CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7);
+CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
+CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1);
+CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2);
+CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8);
+CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5);
+CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd);
+CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109);
+CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d);
+CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0);
+CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1);
+
static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES),
CPUMF_EVENT_PTR(cf_fvn1, INSTRUCTIONS),
@@ -286,7 +414,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = {
NULL,
};
-static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = {
+static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS),
@@ -516,6 +644,141 @@ static struct attribute *cpumcf_z14_pmu_event_attr[] __initdata = {
NULL,
};
+static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z15, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_HPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_L2D_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_L2I_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z15, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1D_ONCHIP_L3_SOURCED_WRITES_RO),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCHIP_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFCLUSTER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_MEMORY_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L3_SOURCED_WRITES_IV),
+ CPUMF_EVENT_PTR(cf_z15, L1I_ONDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, L1I_OFFDRAWER_L4_SOURCED_WRITES),
+ CPUMF_EVENT_PTR(cf_z15, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z15, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z15, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z15, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z15, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
+static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = {
+ CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES),
+ CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TEND),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND),
+ CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY),
+ CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS),
+ CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS),
+ CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS),
+ CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES),
+ CPUMF_EVENT_PTR(cf_z16, SORTL),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CC),
+ CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE),
+ CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE),
+ NULL,
+};
+
/* END: CPUM_CF COUNTER DEFINITIONS ===================================== */
static struct attribute_group cpumcf_pmu_events_group = {
@@ -596,8 +859,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
case 1 ... 5:
csvn = cpumcf_svn_12345_pmu_event_attr;
break;
- case 6:
- csvn = cpumcf_svn_6_pmu_event_attr;
+ case 6 ... 7:
+ csvn = cpumcf_svn_67_pmu_event_attr;
break;
default:
csvn = none;
@@ -624,9 +887,15 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
break;
case 0x3906:
case 0x3907:
+ model = cpumcf_z14_pmu_event_attr;
+ break;
case 0x8561:
case 0x8562:
- model = cpumcf_z14_pmu_event_attr;
+ model = cpumcf_z15_pmu_event_attr;
+ break;
+ case 0x3931:
+ case 0x3932:
+ model = cpumcf_z16_pmu_event_attr;
break;
default:
model = none;
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index c07fdcd73726..06efad5b4f93 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -22,6 +22,7 @@
#include <asm/irq.h>
#include <asm/debug.h>
#include <asm/timex.h>
+#include <linux/io.h>
/* Minimum number of sample-data-block-tables:
* At least one table is required for the sampling buffer structure.
@@ -42,7 +43,7 @@
#define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8)
static inline int require_table_link(const void *sdbt)
{
- return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
+ return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET;
}
/* Minimum and maximum sampling buffer sizes:
@@ -99,6 +100,57 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf);
/* Debug feature */
static debug_info_t *sfdbg;
+/* Sampling control helper functions */
+static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi,
+ unsigned long freq)
+{
+ return (USEC_PER_SEC / freq) * qsi->cpu_speed;
+}
+
+static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi,
+ unsigned long rate)
+{
+ return USEC_PER_SEC * qsi->cpu_speed / rate;
+}
+
+/* Return TOD timestamp contained in an trailer entry */
+static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te)
+{
+ /* TOD in STCKE format */
+ if (te->header.t)
+ return *((unsigned long long *)&te->timestamp[1]);
+
+ /* TOD in STCK format */
+ return *((unsigned long long *)&te->timestamp[0]);
+}
+
+/* Return pointer to trailer entry of an sample data block */
+static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v)
+{
+ void *ret;
+
+ ret = (void *)v;
+ ret += PAGE_SIZE;
+ ret -= sizeof(struct hws_trailer_entry);
+
+ return ret;
+}
+
+/*
+ * Return true if the entry in the sample data block table (sdbt)
+ * is a link to the next sdbt
+ */
+static inline int is_link_entry(unsigned long *s)
+{
+ return *s & 0x1UL ? 1 : 0;
+}
+
+/* Return pointer to the linked sdbt */
+static inline unsigned long *get_next_sdbt(unsigned long *s)
+{
+ return phys_to_virt(*s & ~0x1UL);
+}
+
/*
* sf_disable() - Switch off sampling facility
*/
@@ -140,7 +192,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
if (is_link_entry(curr)) {
curr = get_next_sdbt(curr);
if (sdbt)
- free_page((unsigned long) sdbt);
+ free_page((unsigned long)sdbt);
/* If the origin is reached, sampling buffer is freed */
if (curr == sfb->sdbt)
@@ -150,7 +202,7 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
} else {
/* Process SDB pointer */
if (*curr) {
- free_page(*curr);
+ free_page((unsigned long)phys_to_virt(*curr));
curr++;
}
}
@@ -163,17 +215,18 @@ static void free_sampling_buffer(struct sf_buffer *sfb)
static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags)
{
- unsigned long sdb, *trailer;
+ struct hws_trailer_entry *te;
+ unsigned long sdb;
/* Allocate and initialize sample-data-block */
sdb = get_zeroed_page(gfp_flags);
if (!sdb)
return -ENOMEM;
- trailer = trailer_entry_ptr(sdb);
- *trailer = SDB_TE_ALERT_REQ_MASK;
+ te = trailer_entry_ptr(sdb);
+ te->header.a = 1;
/* Link SDB into the sample-data-block-table */
- *sdbt = sdb;
+ *sdbt = virt_to_phys((void *)sdb);
return 0;
}
@@ -225,14 +278,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
for (i = 0; i < num_sdb; i++) {
/* Allocate a new SDB-table if it is full. */
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(gfp_flags);
+ new = (unsigned long *)get_zeroed_page(gfp_flags);
if (!new) {
rc = -ENOMEM;
break;
}
sfb->num_sdbt++;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys((void *)new) + 1;
tail_prev = tail;
tail = new;
}
@@ -251,7 +304,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
*/
if (tail_prev) {
sfb->num_sdbt--;
- free_page((unsigned long) new);
+ free_page((unsigned long)new);
tail = tail_prev;
}
break;
@@ -262,7 +315,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb,
}
/* Link sampling buffer to its origin */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
debug_sprintf_event(sfdbg, 4, "%s: new buffer"
@@ -290,7 +343,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
return -EINVAL;
/* Allocate the sample-data-block-table origin */
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
return -ENOMEM;
sfb->num_sdb = 0;
@@ -300,7 +353,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb)
* realloc_sampling_buffer() invocation.
*/
sfb->tail = sfb->sdbt;
- *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1;
+ *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1;
/* Allocate requested number of sample-data-blocks */
rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL);
@@ -372,28 +425,33 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
{
- unsigned long n_sdb, freq, factor;
+ unsigned long n_sdb, freq;
size_t sample_size;
/* Calculate sampling buffers using 4K pages
*
- * 1. Determine the sample data size which depends on the used
- * sampling functions, for example, basic-sampling or
- * basic-sampling with diagnostic-sampling.
+ * 1. The sampling size is 32 bytes for basic sampling. This size
+ * is the same for all machine types. Diagnostic
+ * sampling uses auxlilary data buffer setup which provides the
+ * memory for SDBs using linux common code auxiliary trace
+ * setup.
*
- * 2. Use the sampling frequency as input. The sampling buffer is
- * designed for almost one second. This can be adjusted through
- * the "factor" variable.
- * In any case, alloc_sampling_buffer() sets the Alert Request
+ * 2. Function alloc_sampling_buffer() sets the Alert Request
* Control indicator to trigger a measurement-alert to harvest
- * sample-data-blocks (sdb).
+ * sample-data-blocks (SDB). This is done per SDB. This
+ * measurement alert interrupt fires quick enough to handle
+ * one SDB, on very high frequency and work loads there might
+ * be 2 to 3 SBDs available for sample processing.
+ * Currently there is no need for setup alert request on every
+ * n-th page. This is counterproductive as one IRQ triggers
+ * a very high number of samples to be processed at one IRQ.
*
- * 3. Compute the number of sample-data-blocks and ensure a minimum
- * of CPUM_SF_MIN_SDB. Also ensure the upper limit does not
- * exceed a "calculated" maximum. The symbolic maximum is
- * designed for basic-sampling only and needs to be increased if
- * diagnostic-sampling is active.
- * See also the remarks for these symbolic constants.
+ * 3. Use the sampling frequency as input.
+ * Compute the number of SDBs and ensure a minimum
+ * of CPUM_SF_MIN_SDB. Depending on frequency add some more
+ * SDBs to handle a higher sampling rate.
+ * Use a minimum of CPUM_SF_MIN_SDB and allow for 100 samples
+ * (one SDB) for every 10000 HZ frequency increment.
*
* 4. Compute the number of sample-data-block-tables (SDBT) and
* ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up
@@ -401,10 +459,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc)
*/
sample_size = sizeof(struct hws_basic_entry);
freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc));
- factor = 1;
- n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size));
- if (n_sdb < CPUM_SF_MIN_SDB)
- n_sdb = CPUM_SF_MIN_SDB;
+ n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000);
/* If there is already a sampling buffer allocated, it is very likely
* that the sampling facility is enabled too. If the event to be
@@ -539,11 +594,10 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
#define PMC_FAILURE 2
static void setup_pmc_cpu(void *flags)
{
- int err;
struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf);
+ int err = 0;
- err = 0;
- switch (*((int *) flags)) {
+ switch (*((int *)flags)) {
case PMC_INIT:
memset(cpusf, 0, sizeof(*cpusf));
err = qsi(&cpusf->qsi);
@@ -551,28 +605,18 @@ static void setup_pmc_cpu(void *flags)
break;
cpusf->flags |= PMU_F_RESERVED;
err = sf_disable();
- if (err)
- pr_err("Switching off the sampling facility failed "
- "with rc %i\n", err);
- debug_sprintf_event(sfdbg, 5,
- "%s: initialized: cpuhw %p\n", __func__,
- cpusf);
break;
case PMC_RELEASE:
cpusf->flags &= ~PMU_F_RESERVED;
err = sf_disable();
- if (err) {
- pr_err("Switching off the sampling facility failed "
- "with rc %i\n", err);
- } else
+ if (!err)
deallocate_buffers(cpusf);
- debug_sprintf_event(sfdbg, 5,
- "%s: released: cpuhw %p\n", __func__,
- cpusf);
break;
}
- if (err)
- *((int *) flags) |= PMC_FAILURE;
+ if (err) {
+ *((int *)flags) |= PMC_FAILURE;
+ pr_err("Switching off the sampling facility failed with rc %i\n", err);
+ }
}
static void release_pmc_hardware(void)
@@ -669,8 +713,9 @@ static void cpumsf_output_event_pid(struct perf_event *event,
/* Protect callchain buffers, tasks */
rcu_read_lock();
- perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ perf_prepare_sample(data, event, regs);
+ perf_prepare_header(&header, data, event, regs);
+ if (perf_output_begin(&handle, data, event, header.size))
goto out;
/* Update the process ID (see also kernel/events/core.c) */
@@ -832,10 +877,6 @@ static int __hw_perf_event_init(struct perf_event *event)
SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE;
}
- /* Check and set other sampling flags */
- if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS)
- SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS;
-
err = __hw_perf_event_init_rate(event, &si);
if (err)
goto out;
@@ -879,12 +920,21 @@ out:
return err;
}
+static bool is_callchain_event(struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ return sample_type & (PERF_SAMPLE_CALLCHAIN | PERF_SAMPLE_REGS_USER |
+ PERF_SAMPLE_STACK_USER);
+}
+
static int cpumsf_pmu_event_init(struct perf_event *event)
{
int err;
/* No support for taken branch sampling */
- if (has_branch_stack(event))
+ /* No support for callchain, stacks and registers */
+ if (has_branch_stack(event) || is_callchain_event(event))
return -EOPNOTSUPP;
switch (event->attr.type) {
@@ -908,10 +958,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event)
return -ENOENT;
}
- /* Check online status of the CPU to which the event is pinned */
- if (event->cpu >= 0 && !cpu_online(event->cpu))
- return -ENODEV;
-
/* Force reset of idle/hv excludes regardless of what the
* user requested.
*/
@@ -971,8 +1017,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
err = lsctl(&cpuhw->lsctl);
if (err) {
cpuhw->flags &= ~PMU_F_ENABLED;
- pr_err("Loading sampling controls failed: op %i err %i\n",
- 1, err);
+ pr_err("Loading sampling controls failed: op 1 err %i\n", err);
return;
}
@@ -1006,8 +1051,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
err = lsctl(&inactive);
if (err) {
- pr_err("Loading sampling controls failed: op %i err %i\n",
- 2, err);
+ pr_err("Loading sampling controls failed: op 2 err %i\n", err);
return;
}
@@ -1164,11 +1208,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
struct hws_trailer_entry *te;
struct hws_basic_entry *sample;
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
- sample = (struct hws_basic_entry *) *sdbt;
- while ((unsigned long *) sample < (unsigned long *) te) {
+ te = trailer_entry_ptr((unsigned long)sdbt);
+ sample = (struct hws_basic_entry *)sdbt;
+ while ((unsigned long *)sample < (unsigned long *)te) {
/* Check for an empty sample */
- if (!sample->def)
+ if (!sample->def || sample->LS)
break;
/* Update perf event period */
@@ -1195,7 +1239,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
"%s: Found unknown"
" sampling data entry: te->f %i"
" basic.def %#4x (%p)\n", __func__,
- te->f, sample->def, sample);
+ te->header.f, sample->def, sample);
/* Sample slot is not yet written or other record.
*
* This condition can occur if the buffer was reused
@@ -1206,7 +1250,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* that are not full. Stop processing if the first
* invalid format was detected.
*/
- if (!te->f)
+ if (!te->header.f)
break;
}
@@ -1224,18 +1268,16 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
* The sampling buffer position are retrieved and saved in the TEAR_REG
* register of the specified perf event.
*
- * Only full sample-data-blocks are processed. Specify the flash_all flag
- * to also walk through partially filled sample-data-blocks. It is ignored
- * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag
- * enforces the processing of full sample-data-blocks only (trailer entries
- * with the block-full-indicator bit set).
+ * Only full sample-data-blocks are processed. Specify the flush_all flag
+ * to also walk through partially filled sample-data-blocks.
*/
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
+ unsigned long long event_overflow, sampl_overflow, num_sdb;
+ union hws_trailer_header old, prev, new;
struct hw_perf_event *hwc = &event->hw;
struct hws_trailer_entry *te;
- unsigned long *sdbt;
- unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags;
+ unsigned long *sdbt, sdb;
int done;
/*
@@ -1245,50 +1287,52 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
if (SAMPL_DIAG_MODE(&event->hw))
return;
- if (flush_all && SDB_FULL_BLOCKS(hwc))
- flush_all = 0;
-
- sdbt = (unsigned long *) TEAR_REG(hwc);
+ sdbt = (unsigned long *)TEAR_REG(hwc);
done = event_overflow = sampl_overflow = num_sdb = 0;
while (!done) {
/* Get the trailer entry of the sample-data-block */
- te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt);
+ sdb = (unsigned long)phys_to_virt(*sdbt);
+ te = trailer_entry_ptr(sdb);
/* Leave loop if no more work to do (block full indicator) */
- if (!te->f) {
+ if (!te->header.f) {
done = 1;
if (!flush_all)
break;
}
/* Check the sample overflow count */
- if (te->overflow)
+ if (te->header.overflow)
/* Account sample overflows and, if a particular limit
* is reached, extend the sampling buffer.
* For details, see sfb_account_overflows().
*/
- sampl_overflow += te->overflow;
+ sampl_overflow += te->header.overflow;
/* Timestamps are valid for full sample-data-blocks only */
- debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx "
+ debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx "
"overflow %llu timestamp %#llx\n",
- __func__, (unsigned long)sdbt, te->overflow,
- (te->f) ? trailer_timestamp(te) : 0ULL);
+ __func__, sdb, (unsigned long)sdbt,
+ te->header.overflow,
+ (te->header.f) ? trailer_timestamp(te) : 0ULL);
/* Collect all samples from a single sample-data-block and
* flag if an (perf) event overflow happened. If so, the PMU
* is stopped and remaining samples will be discarded.
*/
- hw_collect_samples(event, sdbt, &event_overflow);
+ hw_collect_samples(event, (unsigned long *)sdb, &event_overflow);
num_sdb++;
/* Reset trailer (using compare-double-and-swap) */
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te_flags |= SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- te->flags, te->overflow,
- te_flags, 0ULL));
+ old.val = prev.val;
+ new.val = prev.val;
+ new.f = 0;
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
/* Advance to next sample-data-block */
sdbt++;
@@ -1303,18 +1347,28 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
*/
if (flush_all && done)
break;
-
- /* If an event overflow happened, discard samples by
- * processing any remaining sample-data-blocks.
- */
- if (event_overflow)
- flush_all = 1;
}
/* Account sample overflows in the event hardware structure */
if (sampl_overflow)
OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) +
sampl_overflow, 1 + num_sdb);
+
+ /* Perf_event_overflow() and perf_event_account_interrupt() limit
+ * the interrupt rate to an upper limit. Roughly 1000 samples per
+ * task tick.
+ * Hitting this limit results in a large number
+ * of throttled REF_REPORT_THROTTLE entries and the samples
+ * are dropped.
+ * Slightly increase the interval to avoid hitting this limit.
+ */
+ if (event_overflow) {
+ SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10);
+ debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n",
+ __func__,
+ DIV_ROUND_UP(SAMPL_RATE(hwc), 10));
+ }
+
if (sampl_overflow || event_overflow)
debug_sprintf_event(sfdbg, 4, "%s: "
"overflows: sample %llu event %llu"
@@ -1323,10 +1377,26 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
OVERFLOW_REG(hwc), num_sdb);
}
-#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb)
-#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0)
-#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark)
-#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark)
+static inline unsigned long aux_sdb_index(struct aux_buffer *aux,
+ unsigned long i)
+{
+ return i % aux->sfb.num_sdb;
+}
+
+static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end)
+{
+ return end >= start ? end - start + 1 : 0;
+}
+
+static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->alert_mark);
+}
+
+static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux)
+{
+ return aux_sdb_num(aux->head, aux->empty_mark);
+}
/*
* Get trailer entry by index of SDB.
@@ -1336,9 +1406,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux,
{
unsigned long sdb;
- index = AUX_SDB_INDEX(aux, index);
+ index = aux_sdb_index(aux, index);
sdb = aux->sdb_index[index];
- return (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ return trailer_entry_ptr(sdb);
}
/*
@@ -1360,10 +1430,10 @@ static void aux_output_end(struct perf_output_handle *handle)
if (!aux)
return;
- range_scan = AUX_SDB_NUM_ALERT(aux);
+ range_scan = aux_sdb_num_alert(aux);
for (i = 0, idx = aux->head; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- if (!(te->flags & SDB_TE_BUFFER_FULL_MASK))
+ if (!te->header.f)
break;
}
/* i is num of SDBs which are full */
@@ -1371,9 +1441,10 @@ static void aux_output_end(struct perf_output_handle *handle)
/* Remove alert indicators in the buffer */
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags &= ~SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 0;
- debug_sprintf_event(sfdbg, 6, "%s: collect %#lx SDBs\n", __func__, i);
+ debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n",
+ __func__, i, range_scan, aux->head);
}
/*
@@ -1389,9 +1460,7 @@ static int aux_output_begin(struct perf_output_handle *handle,
struct aux_buffer *aux,
struct cpu_hw_sf *cpuhw)
{
- unsigned long range;
- unsigned long i, range_scan, idx;
- unsigned long head, base, offset;
+ unsigned long range, i, range_scan, idx, head, base, offset;
struct hws_trailer_entry *te;
if (WARN_ON_ONCE(handle->head & ~PAGE_MASK))
@@ -1406,14 +1475,18 @@ static int aux_output_begin(struct perf_output_handle *handle,
* SDBs between aux->head and aux->empty_mark are already ready
* for new data. range_scan is num of SDBs not within them.
*/
- if (range > AUX_SDB_NUM_EMPTY(aux)) {
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
+ debug_sprintf_event(sfdbg, 6,
+ "%s: range %ld head %ld alert %ld empty %ld\n",
+ __func__, range, aux->head, aux->alert_mark,
+ aux->empty_mark);
+ if (range > aux_sdb_num_empty(aux)) {
+ range_scan = range - aux_sdb_num_empty(aux);
idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
- te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK;
- te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK;
- te->overflow = 0;
+ te->header.f = 0;
+ te->header.a = 0;
+ te->header.overflow = 0;
}
/* Save the position of empty SDBs */
aux->empty_mark = aux->head + range - 1;
@@ -1422,24 +1495,20 @@ static int aux_output_begin(struct perf_output_handle *handle,
/* Set alert indicator */
aux->alert_mark = aux->head + range/2 - 1;
te = aux_sdb_trailer(aux, aux->alert_mark);
- te->flags = te->flags | SDB_TE_ALERT_REQ_MASK;
+ te->header.a = 1;
/* Reset hardware buffer head */
- head = AUX_SDB_INDEX(aux, aux->head);
+ head = aux_sdb_index(aux, aux->head);
base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE];
offset = head % CPUM_SF_SDB_PER_TABLE;
- cpuhw->lsctl.tear = base + offset * sizeof(unsigned long);
- cpuhw->lsctl.dear = aux->sdb_index[head];
+ cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long);
+ cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]);
- debug_sprintf_event(sfdbg, 6, "%s: "
- "head->alert_mark->empty_mark (num_alert, range)"
- "[%#lx -> %#lx -> %#lx] (%#lx, %#lx) "
- "tear index %#lx, tear %#lx dear %#lx\n", __func__,
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld "
+ "index %ld tear %#lx dear %#lx\n", __func__,
aux->head, aux->alert_mark, aux->empty_mark,
- AUX_SDB_NUM_ALERT(aux), range,
head / CPUM_SF_SDB_PER_TABLE,
- cpuhw->lsctl.tear,
- cpuhw->lsctl.dear);
+ cpuhw->lsctl.tear, cpuhw->lsctl.dear);
return 0;
}
@@ -1453,15 +1522,16 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
+ union hws_trailer_header old, prev, new;
struct hws_trailer_entry *te;
te = aux_sdb_trailer(aux, alert_index);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- *overflow = orig_overflow;
- if (orig_flags & SDB_TE_BUFFER_FULL_MASK) {
+ old.val = prev.val;
+ new.val = prev.val;
+ *overflow = old.overflow;
+ if (old.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1469,10 +1539,10 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
*/
return false;
}
- new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 1;
+ new.overflow = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
return true;
}
@@ -1501,11 +1571,15 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- unsigned long long orig_overflow, orig_flags, new_flags;
- unsigned long i, range_scan, idx;
+ unsigned long i, range_scan, idx, idx_old;
+ union hws_trailer_header old, prev, new;
+ unsigned long long orig_overflow;
struct hws_trailer_entry *te;
- if (range <= AUX_SDB_NUM_EMPTY(aux))
+ debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld "
+ "empty %ld\n", __func__, range, aux->head,
+ aux->alert_mark, aux->empty_mark);
+ if (range <= aux_sdb_num_empty(aux))
/*
* No need to scan. All SDBs in range are marked as empty.
* Just set alert indicator. Should check race with hardware
@@ -1526,27 +1600,32 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
* Start scanning from one SDB behind empty_mark. If the new alert
* indicator fall into this range, set it.
*/
- range_scan = range - AUX_SDB_NUM_EMPTY(aux);
- idx = aux->empty_mark + 1;
+ range_scan = range - aux_sdb_num_empty(aux);
+ idx_old = idx = aux->empty_mark + 1;
for (i = 0; i < range_scan; i++, idx++) {
te = aux_sdb_trailer(aux, idx);
+ prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- orig_flags = te->flags;
- orig_overflow = te->overflow;
- new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK;
+ old.val = prev.val;
+ new.val = prev.val;
+ orig_overflow = old.overflow;
+ new.f = 0;
+ new.overflow = 0;
if (idx == aux->alert_mark)
- new_flags |= SDB_TE_ALERT_REQ_MASK;
+ new.a = 1;
else
- new_flags &= ~SDB_TE_ALERT_REQ_MASK;
- } while (!cmpxchg_double(&te->flags, &te->overflow,
- orig_flags, orig_overflow,
- new_flags, 0ULL));
+ new.a = 0;
+ prev.val = cmpxchg128(&te->header.val, old.val, new.val);
+ } while (prev.val != old.val);
*overflow += orig_overflow;
}
/* Update empty_mark to new position */
aux->empty_mark = aux->head + range - 1;
+ debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld "
+ "empty %ld\n", __func__, range_scan, idx_old,
+ idx - 1, aux->empty_mark);
return true;
}
@@ -1567,10 +1646,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
return;
/* Inform user space new data arrived */
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
+ debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__,
+ size >> PAGE_SHIFT);
perf_aux_output_end(handle, size);
- num_sdb = aux->sfb.num_sdb;
+ num_sdb = aux->sfb.num_sdb;
while (!done) {
/* Get an output handle */
aux = perf_aux_output_begin(handle, cpuhw->event);
@@ -1578,9 +1659,6 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
pr_err("The AUX buffer with %lu pages for the "
"diagnostic-sampling mode is full\n",
num_sdb);
- debug_sprintf_event(sfdbg, 1,
- "%s: AUX buffer used up\n",
- __func__);
break;
}
if (WARN_ON_ONCE(!aux))
@@ -1602,14 +1680,14 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
size = range << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
pr_err("Sample data caused the AUX buffer with %lu "
- "pages to overflow\n", num_sdb);
- debug_sprintf_event(sfdbg, 1, "%s: head %#lx range %#lx "
- "overflow %#llx\n", __func__,
+ "pages to overflow\n", aux->sfb.num_sdb);
+ debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld "
+ "overflow %lld\n", __func__,
aux->head, range, overflow);
} else {
- size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT;
+ size = aux_sdb_num_alert(aux) << PAGE_SHIFT;
perf_aux_output_end(&cpuhw->handle, size);
- debug_sprintf_event(sfdbg, 6, "%s: head %#lx alert %#lx "
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
"already full, try another\n",
__func__,
aux->head, aux->alert_mark);
@@ -1617,11 +1695,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw)
}
if (done)
- debug_sprintf_event(sfdbg, 6, "%s: aux_reset_buffer "
- "[%#lx -> %#lx -> %#lx] (%#lx, %#lx)\n",
- __func__, aux->head, aux->alert_mark,
- aux->empty_mark, AUX_SDB_NUM_ALERT(aux),
- range);
+ debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld "
+ "empty %ld\n", __func__, aux->head,
+ aux->alert_mark, aux->empty_mark);
}
/*
@@ -1644,19 +1720,18 @@ static void aux_buffer_free(void *data)
kfree(aux->sdb_index);
kfree(aux);
- debug_sprintf_event(sfdbg, 4, "%s: free "
- "%lu SDBTs\n", __func__, num_sdbt);
+ debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt);
}
static void aux_sdb_init(unsigned long sdb)
{
struct hws_trailer_entry *te;
- te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb);
+ te = trailer_entry_ptr(sdb);
/* Save clock base */
te->clock_base = 1;
- memcpy(&te->progusage2, &tod_clock_base[1], 8);
+ te->progusage2 = tod_clock_base.tod;
}
/*
@@ -1697,13 +1772,13 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
}
/* Allocate aux_buffer struct for the event */
- aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL);
+ aux = kzalloc(sizeof(struct aux_buffer), GFP_KERNEL);
if (!aux)
goto no_aux;
sfb = &aux->sfb;
/* Allocate sdbt_index for fast reference */
- n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE;
+ n_sdbt = DIV_ROUND_UP(nr_pages, CPUM_SF_SDB_PER_TABLE);
aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL);
if (!aux->sdbt_index)
goto no_sdbt_index;
@@ -1715,7 +1790,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
/* Allocate the first SDBT */
sfb->num_sdbt = 0;
- sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!sfb->sdbt)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt;
@@ -1727,23 +1802,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
*/
for (i = 0; i < nr_pages; i++, tail++) {
if (require_table_link(tail)) {
- new = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ new = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (!new)
goto no_sdbt;
aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new;
/* Link current page to tail of chain */
- *tail = (unsigned long)(void *) new + 1;
+ *tail = virt_to_phys(new) + 1;
tail = new;
}
/* Tail is the entry in a SDBT */
- *tail = (unsigned long)pages[i];
+ *tail = virt_to_phys(pages[i]);
aux->sdb_index[i] = (unsigned long)pages[i];
aux_sdb_init((unsigned long)pages[i]);
}
sfb->num_sdb = nr_pages;
/* Link the last entry in the SDBT to the first SDBT */
- *tail = (unsigned long) sfb->sdbt + 1;
+ *tail = virt_to_phys(sfb->sdbt) + 1;
sfb->tail = tail;
/*
@@ -1753,8 +1828,8 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages,
*/
aux->empty_mark = sfb->num_sdb - 1;
- debug_sprintf_event(sfdbg, 4, "%s: setup %lu SDBTs and %lu SDBs\n",
- __func__, sfb->num_sdbt, sfb->num_sdb);
+ debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__,
+ sfb->num_sdbt, sfb->num_sdb);
return aux;
@@ -1776,7 +1851,7 @@ static void cpumsf_pmu_read(struct perf_event *event)
/* Nothing to do ... updates are interrupt-driven */
}
-/* Check if the new sampling period/freqeuncy is appropriate.
+/* Check if the new sampling period/frequency is appropriate.
*
* Return non-zero on error and zero on passed checks.
*/
@@ -1883,9 +1958,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags)
cpuhw->lsctl.h = 1;
cpuhw->lsctl.interval = SAMPL_RATE(&event->hw);
if (!SAMPL_DIAG_MODE(&event->hw)) {
- cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt;
- cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt;
- TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt;
+ cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt);
+ cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt;
+ TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt;
}
/* Ensure sampling functions are in the disabled state. If disabled,
@@ -2202,4 +2277,4 @@ out:
}
arch_initcall(init_cpum_sampling_pmu);
-core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0640);
+core_param(cpum_sfb_size, CPUM_SF_MAX_SDB, sfb_size, 0644);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 1e75cc983546..dfa77da2fd2e 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -15,7 +15,10 @@
#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <linux/sysfs.h>
+#include <asm/stacktrace.h>
#include <asm/irq.h>
#include <asm/cpu_mf.h>
#include <asm/lowcore.h>
@@ -23,27 +26,6 @@
#include <asm/sysinfo.h>
#include <asm/unwind.h>
-const char *perf_pmu_name(void)
-{
- if (cpum_cf_avail() || cpum_sf_avail())
- return "CPU-Measurement Facilities (CPU-MF)";
- return "pmu";
-}
-EXPORT_SYMBOL(perf_pmu_name);
-
-int perf_num_counters(void)
-{
- int num = 0;
-
- if (cpum_cf_avail())
- num += PERF_CPUM_CF_MAX_CTR;
- if (cpum_sf_avail())
- num += PERF_CPUM_SF_MAX_CTR;
-
- return num;
-}
-EXPORT_SYMBOL(perf_num_counters);
-
static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
{
struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
@@ -51,7 +33,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs)
if (!stack)
return NULL;
- return (struct kvm_s390_sie_block *) stack->empty1[0];
+ return (struct kvm_s390_sie_block *)stack->sie_control_block;
}
static bool is_in_guest(struct pt_regs *regs)
@@ -233,6 +215,44 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
}
}
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
+ struct pt_regs *regs)
+{
+ struct stack_frame_user __user *sf;
+ unsigned long ip, sp;
+ bool first = true;
+
+ if (is_compat_task())
+ return;
+ perf_callchain_store(entry, instruction_pointer(regs));
+ sf = (void __user *)user_stack_pointer(regs);
+ pagefault_disable();
+ while (entry->nr < entry->max_stack) {
+ if (__get_user(sp, &sf->back_chain))
+ break;
+ if (__get_user(ip, &sf->gprs[8]))
+ break;
+ if (ip & 0x1) {
+ /*
+ * If the instruction address is invalid, and this
+ * is the first stack frame, assume r14 has not
+ * been written to the stack yet. Otherwise exit.
+ */
+ if (first && !(regs->gprs[14] & 0x1))
+ ip = regs->gprs[14];
+ else
+ break;
+ }
+ perf_callchain_store(entry, ip);
+ /* Sanity check: ABI requires SP to be aligned 8 bytes. */
+ if (!sp || sp & 0x7)
+ break;
+ sf = (void __user *)sp;
+ first = false;
+ }
+ pagefault_enable();
+}
+
/* Perf definitions for PMU event attributes in sysfs */
ssize_t cpumf_events_sysfs_show(struct device *dev,
struct device_attribute *attr, char *page)
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
new file mode 100644
index 000000000000..bf8a672b15a4
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -0,0 +1,773 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_crypto"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+#include <asm/ctlreg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+static debug_info_t *cfm_dbg;
+static unsigned int paicrypt_cnt; /* Size of the mapped counter sets */
+ /* extracted with QPACI instruction */
+
+DEFINE_STATIC_KEY_FALSE(pai_key);
+
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+struct paicrypt_map {
+ unsigned long *page; /* Page for CPU to store counters */
+ struct pai_userdata *save; /* Page to store no-zero counters */
+ unsigned int active_events; /* # of PAI crypto users */
+ refcount_t refcnt; /* Reference count mapped buffers */
+ enum paievt_mode mode; /* Type of event */
+ struct perf_event *event; /* Perf event for sampling */
+};
+
+struct paicrypt_mapptr {
+ struct paicrypt_map *mapptr;
+};
+
+static struct paicrypt_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct paicrypt_mapptr __percpu *mapptr;
+} paicrypt_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paicrypt_root_free(void)
+{
+ if (refcount_dec_and_test(&paicrypt_root.refcnt)) {
+ free_percpu(paicrypt_root.mapptr);
+ paicrypt_root.mapptr = NULL;
+ }
+ debug_sprintf_event(cfm_dbg, 5, "%s root.refcount %d\n", __func__,
+ refcount_read(&paicrypt_root.refcnt));
+}
+
+/*
+ * On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int paicrypt_root_alloc(void)
+{
+ if (!refcount_inc_not_zero(&paicrypt_root.refcnt)) {
+ /* The memory is already zeroed. */
+ paicrypt_root.mapptr = alloc_percpu(struct paicrypt_mapptr);
+ if (!paicrypt_root.mapptr)
+ return -ENOMEM;
+ refcount_set(&paicrypt_root.refcnt, 1);
+ }
+ return 0;
+}
+
+/* Release the PMU if event is the last perf event */
+static DEFINE_MUTEX(pai_reserve_mutex);
+
+/* Adjust usage counters and remove allocated memory when all users are
+ * gone.
+ */
+static void paicrypt_event_destroy(struct perf_event *event)
+{
+ struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr,
+ event->cpu);
+ struct paicrypt_map *cpump = mp->mapptr;
+
+ cpump->event = NULL;
+ static_branch_dec(&pai_key);
+ mutex_lock(&pai_reserve_mutex);
+ debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d"
+ " mode %d refcnt %u\n", __func__,
+ event->attr.config, event->cpu,
+ cpump->active_events, cpump->mode,
+ refcount_read(&cpump->refcnt));
+ if (refcount_dec_and_test(&cpump->refcnt)) {
+ debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
+ __func__, (unsigned long)cpump->page,
+ cpump->save);
+ free_page((unsigned long)cpump->page);
+ kvfree(cpump->save);
+ kfree(cpump);
+ mp->mapptr = NULL;
+ }
+ paicrypt_root_free();
+ mutex_unlock(&pai_reserve_mutex);
+}
+
+static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel)
+{
+ if (kernel)
+ nr += PAI_CRYPTO_MAXCTR;
+ return page[nr];
+}
+
+/* Read the counter values. Return value from location in CMP. For event
+ * CRYPTO_ALL sum up all events.
+ */
+static u64 paicrypt_getdata(struct perf_event *event, bool kernel)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ u64 sum = 0;
+ int i;
+
+ if (event->attr.config != PAI_CRYPTO_BASE) {
+ return paicrypt_getctr(cpump->page,
+ event->attr.config - PAI_CRYPTO_BASE,
+ kernel);
+ }
+
+ for (i = 1; i <= paicrypt_cnt; i++) {
+ u64 val = paicrypt_getctr(cpump->page, i, kernel);
+
+ if (!val)
+ continue;
+ sum += val;
+ }
+ return sum;
+}
+
+static u64 paicrypt_getall(struct perf_event *event)
+{
+ u64 sum = 0;
+
+ if (!event->attr.exclude_kernel)
+ sum += paicrypt_getdata(event, true);
+ if (!event->attr.exclude_user)
+ sum += paicrypt_getdata(event, false);
+
+ return sum;
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for crypto events
+ *
+ * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is save to block.
+ * When the event initialization functions fails, no other call back will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+static struct paicrypt_map *paicrypt_busy(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ struct paicrypt_map *cpump = NULL;
+ struct paicrypt_mapptr *mp;
+ int rc;
+
+ mutex_lock(&pai_reserve_mutex);
+
+ /* Allocate root node */
+ rc = paicrypt_root_alloc();
+ if (rc)
+ goto unlock;
+
+ /* Allocate node for this event */
+ mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu);
+ cpump = mp->mapptr;
+ if (!cpump) { /* Paicrypt_map allocated? */
+ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+ if (!cpump) {
+ rc = -ENOMEM;
+ goto free_root;
+ }
+ }
+
+ if (a->sample_period) { /* Sampling requested */
+ if (cpump->mode != PAI_MODE_NONE)
+ rc = -EBUSY; /* ... sampling/counting active */
+ } else { /* Counting requested */
+ if (cpump->mode == PAI_MODE_SAMPLING)
+ rc = -EBUSY; /* ... and sampling active */
+ }
+ /*
+ * This error case triggers when there is a conflict:
+ * Either sampling requested and counting already active, or visa
+ * versa. Therefore the struct paicrypto_map for this CPU is
+ * needed or the error could not have occurred. Only adjust root
+ * node refcount.
+ */
+ if (rc)
+ goto free_root;
+
+ /* Allocate memory for counter page and counter extraction.
+ * Only the first counting event has to allocate a page.
+ */
+ if (cpump->page) {
+ refcount_inc(&cpump->refcnt);
+ goto unlock;
+ }
+
+ rc = -ENOMEM;
+ cpump->page = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (!cpump->page)
+ goto free_paicrypt_map;
+ cpump->save = kvmalloc_array(paicrypt_cnt + 1,
+ sizeof(struct pai_userdata), GFP_KERNEL);
+ if (!cpump->save) {
+ free_page((unsigned long)cpump->page);
+ cpump->page = NULL;
+ goto free_paicrypt_map;
+ }
+
+ /* Set mode and reference count */
+ rc = 0;
+ refcount_set(&cpump->refcnt, 1);
+ cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING;
+ mp->mapptr = cpump;
+ debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d"
+ " mode %d refcnt %u page %#lx save %p rc %d\n",
+ __func__, a->sample_period, cpump->active_events,
+ cpump->mode, refcount_read(&cpump->refcnt),
+ (unsigned long)cpump->page, cpump->save, rc);
+ goto unlock;
+
+free_paicrypt_map:
+ kfree(cpump);
+ mp->mapptr = NULL;
+free_root:
+ paicrypt_root_free();
+
+unlock:
+ mutex_unlock(&pai_reserve_mutex);
+ return rc ? ERR_PTR(rc) : cpump;
+}
+
+/* Might be called on different CPU than the one the event is intended for. */
+static int paicrypt_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ struct paicrypt_map *cpump;
+
+ /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI crypto event must be in valid range */
+ if (a->config < PAI_CRYPTO_BASE ||
+ a->config > PAI_CRYPTO_BASE + paicrypt_cnt)
+ return -EINVAL;
+ /* Allow only CPU wide operation, no process context for now. */
+ if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only CRYPTO_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_CRYPTO_BASE)
+ return -EINVAL;
+
+ cpump = paicrypt_busy(event);
+ if (IS_ERR(cpump))
+ return PTR_ERR(cpump);
+
+ event->destroy = paicrypt_event_destroy;
+
+ if (a->sample_period) {
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paicrypt_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which contain the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ static_branch_inc(&pai_key);
+ return 0;
+}
+
+static void paicrypt_read(struct perf_event *event)
+{
+ u64 prev, new, delta;
+
+ prev = local64_read(&event->hw.prev_count);
+ new = paicrypt_getall(event);
+ local64_set(&event->hw.prev_count, new);
+ delta = (prev <= new) ? new - prev
+ : (-1ULL - prev) + new + 1; /* overflow */
+ local64_add(delta, &event->count);
+}
+
+static void paicrypt_start(struct perf_event *event, int flags)
+{
+ u64 sum;
+
+ /* Event initialization sets last_tag to 0. When later on the events
+ * are deleted and re-added, do not reset the event count value to zero.
+ * Events are added, deleted and re-added when 2 or more events
+ * are active at the same time.
+ */
+ if (!event->attr.sample_period) { /* Counting */
+ if (!event->hw.last_tag) {
+ event->hw.last_tag = 1;
+ sum = paicrypt_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
+ }
+ } else { /* Sampling */
+ perf_sched_cb_inc(event->pmu);
+ }
+}
+
+static int paicrypt_add(struct perf_event *event, int flags)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ unsigned long ccd;
+
+ if (++cpump->active_events == 1) {
+ ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET;
+ WRITE_ONCE(S390_lowcore.ccd, ccd);
+ local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+ }
+ cpump->event = event;
+ if (flags & PERF_EF_START)
+ paicrypt_start(event, PERF_EF_RELOAD);
+ event->hw.state = 0;
+ return 0;
+}
+
+static void paicrypt_stop(struct perf_event *event, int flags)
+{
+ if (!event->attr.sample_period) /* Counting */
+ paicrypt_read(event);
+ else /* Sampling */
+ perf_sched_cb_dec(event->pmu);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paicrypt_del(struct perf_event *event, int flags)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+
+ paicrypt_stop(event, PERF_EF_UPDATE);
+ if (--cpump->active_events == 0) {
+ local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
+ WRITE_ONCE(S390_lowcore.ccd, 0);
+ }
+}
+
+/* Create raw data and save it in buffer. Returns number of bytes copied.
+ * Saves only positive counter entries of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ */
+static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page,
+ bool exclude_user, bool exclude_kernel)
+{
+ int i, outidx = 0;
+
+ for (i = 1; i <= paicrypt_cnt; i++) {
+ u64 val = 0;
+
+ if (!exclude_kernel)
+ val += paicrypt_getctr(page, i, true);
+ if (!exclude_user)
+ val += paicrypt_getctr(page, i, false);
+ if (val) {
+ userdata[outidx].num = i;
+ userdata[outidx].value = val;
+ outidx++;
+ }
+ }
+ return outidx * sizeof(struct pai_userdata);
+}
+
+static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
+ struct perf_event *event)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU) {
+ data.cpu_entry.cpu = smp_processor_id();
+ data.cpu_entry.reserved = 0;
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore page after read */
+ memset(cpump->page, 0, PAGE_SIZE);
+ return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paicrypt_have_sample(void)
+{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
+ struct perf_event *event = cpump->event;
+ size_t rawsize;
+ int rc = 0;
+
+ if (!event) /* No event active */
+ return 0;
+ rawsize = paicrypt_copy(cpump->save, cpump->page,
+ cpump->event->attr.exclude_user,
+ cpump->event->attr.exclude_kernel);
+ if (rawsize) /* No incremented counters */
+ rc = paicrypt_push_sample(rawsize, cpump, event);
+ return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event CRYPTO_ALL is allowed.
+ */
+static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ /* We started with a clean page on event installation. So read out
+ * results on schedule_out and if page was dirty, clear values.
+ */
+ if (!sched_in)
+ paicrypt_have_sample();
+}
+
+/* Attribute definitions for paicrypt interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instructions returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported, there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1000 + offset in mapped kernel page.
+ * All CPU Measurement Facility counters identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paicrypt_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group paicrypt_events_group = {
+ .name = "events",
+ .attrs = NULL /* Filled in attr_event_init() */
+};
+
+static struct attribute_group paicrypt_format_group = {
+ .name = "format",
+ .attrs = paicrypt_format_attr,
+};
+
+static const struct attribute_group *paicrypt_attr_groups[] = {
+ &paicrypt_events_group,
+ &paicrypt_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paicrypt = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paicrypt_event_init,
+ .add = paicrypt_add,
+ .del = paicrypt_del,
+ .start = paicrypt_start,
+ .stop = paicrypt_stop,
+ .read = paicrypt_read,
+ .sched_task = paicrypt_sched_task,
+ .attr_groups = paicrypt_attr_groups
+};
+
+/* List of symbolic PAI counter names. */
+static const char * const paicrypt_ctrnames[] = {
+ [0] = "CRYPTO_ALL",
+ [1] = "KM_DEA",
+ [2] = "KM_TDEA_128",
+ [3] = "KM_TDEA_192",
+ [4] = "KM_ENCRYPTED_DEA",
+ [5] = "KM_ENCRYPTED_TDEA_128",
+ [6] = "KM_ENCRYPTED_TDEA_192",
+ [7] = "KM_AES_128",
+ [8] = "KM_AES_192",
+ [9] = "KM_AES_256",
+ [10] = "KM_ENCRYPTED_AES_128",
+ [11] = "KM_ENCRYPTED_AES_192",
+ [12] = "KM_ENCRYPTED_AES_256",
+ [13] = "KM_XTS_AES_128",
+ [14] = "KM_XTS_AES_256",
+ [15] = "KM_XTS_ENCRYPTED_AES_128",
+ [16] = "KM_XTS_ENCRYPTED_AES_256",
+ [17] = "KMC_DEA",
+ [18] = "KMC_TDEA_128",
+ [19] = "KMC_TDEA_192",
+ [20] = "KMC_ENCRYPTED_DEA",
+ [21] = "KMC_ENCRYPTED_TDEA_128",
+ [22] = "KMC_ENCRYPTED_TDEA_192",
+ [23] = "KMC_AES_128",
+ [24] = "KMC_AES_192",
+ [25] = "KMC_AES_256",
+ [26] = "KMC_ENCRYPTED_AES_128",
+ [27] = "KMC_ENCRYPTED_AES_192",
+ [28] = "KMC_ENCRYPTED_AES_256",
+ [29] = "KMC_PRNG",
+ [30] = "KMA_GCM_AES_128",
+ [31] = "KMA_GCM_AES_192",
+ [32] = "KMA_GCM_AES_256",
+ [33] = "KMA_GCM_ENCRYPTED_AES_128",
+ [34] = "KMA_GCM_ENCRYPTED_AES_192",
+ [35] = "KMA_GCM_ENCRYPTED_AES_256",
+ [36] = "KMF_DEA",
+ [37] = "KMF_TDEA_128",
+ [38] = "KMF_TDEA_192",
+ [39] = "KMF_ENCRYPTED_DEA",
+ [40] = "KMF_ENCRYPTED_TDEA_128",
+ [41] = "KMF_ENCRYPTED_TDEA_192",
+ [42] = "KMF_AES_128",
+ [43] = "KMF_AES_192",
+ [44] = "KMF_AES_256",
+ [45] = "KMF_ENCRYPTED_AES_128",
+ [46] = "KMF_ENCRYPTED_AES_192",
+ [47] = "KMF_ENCRYPTED_AES_256",
+ [48] = "KMCTR_DEA",
+ [49] = "KMCTR_TDEA_128",
+ [50] = "KMCTR_TDEA_192",
+ [51] = "KMCTR_ENCRYPTED_DEA",
+ [52] = "KMCTR_ENCRYPTED_TDEA_128",
+ [53] = "KMCTR_ENCRYPTED_TDEA_192",
+ [54] = "KMCTR_AES_128",
+ [55] = "KMCTR_AES_192",
+ [56] = "KMCTR_AES_256",
+ [57] = "KMCTR_ENCRYPTED_AES_128",
+ [58] = "KMCTR_ENCRYPTED_AES_192",
+ [59] = "KMCTR_ENCRYPTED_AES_256",
+ [60] = "KMO_DEA",
+ [61] = "KMO_TDEA_128",
+ [62] = "KMO_TDEA_192",
+ [63] = "KMO_ENCRYPTED_DEA",
+ [64] = "KMO_ENCRYPTED_TDEA_128",
+ [65] = "KMO_ENCRYPTED_TDEA_192",
+ [66] = "KMO_AES_128",
+ [67] = "KMO_AES_192",
+ [68] = "KMO_AES_256",
+ [69] = "KMO_ENCRYPTED_AES_128",
+ [70] = "KMO_ENCRYPTED_AES_192",
+ [71] = "KMO_ENCRYPTED_AES_256",
+ [72] = "KIMD_SHA_1",
+ [73] = "KIMD_SHA_256",
+ [74] = "KIMD_SHA_512",
+ [75] = "KIMD_SHA3_224",
+ [76] = "KIMD_SHA3_256",
+ [77] = "KIMD_SHA3_384",
+ [78] = "KIMD_SHA3_512",
+ [79] = "KIMD_SHAKE_128",
+ [80] = "KIMD_SHAKE_256",
+ [81] = "KIMD_GHASH",
+ [82] = "KLMD_SHA_1",
+ [83] = "KLMD_SHA_256",
+ [84] = "KLMD_SHA_512",
+ [85] = "KLMD_SHA3_224",
+ [86] = "KLMD_SHA3_256",
+ [87] = "KLMD_SHA3_384",
+ [88] = "KLMD_SHA3_512",
+ [89] = "KLMD_SHAKE_128",
+ [90] = "KLMD_SHAKE_256",
+ [91] = "KMAC_DEA",
+ [92] = "KMAC_TDEA_128",
+ [93] = "KMAC_TDEA_192",
+ [94] = "KMAC_ENCRYPTED_DEA",
+ [95] = "KMAC_ENCRYPTED_TDEA_128",
+ [96] = "KMAC_ENCRYPTED_TDEA_192",
+ [97] = "KMAC_AES_128",
+ [98] = "KMAC_AES_192",
+ [99] = "KMAC_AES_256",
+ [100] = "KMAC_ENCRYPTED_AES_128",
+ [101] = "KMAC_ENCRYPTED_AES_192",
+ [102] = "KMAC_ENCRYPTED_AES_256",
+ [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA",
+ [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128",
+ [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192",
+ [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA",
+ [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128",
+ [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192",
+ [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128",
+ [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192",
+ [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256",
+ [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128",
+ [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192",
+ [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256A",
+ [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128",
+ [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256",
+ [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128",
+ [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256",
+ [119] = "PCC_SCALAR_MULTIPLY_P256",
+ [120] = "PCC_SCALAR_MULTIPLY_P384",
+ [121] = "PCC_SCALAR_MULTIPLY_P521",
+ [122] = "PCC_SCALAR_MULTIPLY_ED25519",
+ [123] = "PCC_SCALAR_MULTIPLY_ED448",
+ [124] = "PCC_SCALAR_MULTIPLY_X25519",
+ [125] = "PCC_SCALAR_MULTIPLY_X448",
+ [126] = "PRNO_SHA_512_DRNG",
+ [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO",
+ [128] = "PRNO_TRNG",
+ [129] = "KDSA_ECDSA_VERIFY_P256",
+ [130] = "KDSA_ECDSA_VERIFY_P384",
+ [131] = "KDSA_ECDSA_VERIFY_P521",
+ [132] = "KDSA_ECDSA_SIGN_P256",
+ [133] = "KDSA_ECDSA_SIGN_P384",
+ [134] = "KDSA_ECDSA_SIGN_P521",
+ [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256",
+ [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384",
+ [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521",
+ [138] = "KDSA_EDDSA_VERIFY_ED25519",
+ [139] = "KDSA_EDDSA_VERIFY_ED448",
+ [140] = "KDSA_EDDSA_SIGN_ED25519",
+ [141] = "KDSA_EDDSA_SIGN_ED448",
+ [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519",
+ [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448",
+ [144] = "PCKMO_ENCRYPT_DEA_KEY",
+ [145] = "PCKMO_ENCRYPT_TDEA_128_KEY",
+ [146] = "PCKMO_ENCRYPT_TDEA_192_KEY",
+ [147] = "PCKMO_ENCRYPT_AES_128_KEY",
+ [148] = "PCKMO_ENCRYPT_AES_192_KEY",
+ [149] = "PCKMO_ENCRYPT_AES_256_KEY",
+ [150] = "PCKMO_ENCRYPT_ECC_P256_KEY",
+ [151] = "PCKMO_ENCRYPT_ECC_P384_KEY",
+ [152] = "PCKMO_ENCRYPT_ECC_P521_KEY",
+ [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY",
+ [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY",
+ [155] = "IBM_RESERVED_155",
+ [156] = "IBM_RESERVED_156",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ struct device_attribute *dap;
+
+ dap = container_of(attrs[i], struct device_attribute, attr);
+ pa = container_of(dap, struct perf_pmu_events_attr, attr);
+ kfree(pa);
+ }
+ kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ pa->id = PAI_CRYPTO_BASE + num;
+ pa->attr.attr.name = paicrypt_ctrnames[num];
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly. */
+static int __init attr_event_init(void)
+{
+ struct attribute **attrs;
+ int ret, i;
+
+ attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs),
+ GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) {
+ ret = attr_event_init_one(attrs, i);
+ if (ret) {
+ attr_event_free(attrs, i - 1);
+ return ret;
+ }
+ }
+ attrs[i] = NULL;
+ paicrypt_events_group.attrs = attrs;
+ return 0;
+}
+
+static int __init paicrypt_init(void)
+{
+ struct qpaci_info_block ib;
+ int rc;
+
+ if (!test_facility(196))
+ return 0;
+
+ qpaci(&ib);
+ paicrypt_cnt = ib.num_cc;
+ if (paicrypt_cnt == 0)
+ return 0;
+ if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR)
+ paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1;
+
+ rc = attr_event_init(); /* Export known PAI crypto events */
+ if (rc) {
+ pr_err("Creation of PMU pai_crypto /sysfs failed\n");
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ cfm_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+ if (!cfm_dbg) {
+ pr_err("Registration of s390dbf pai_crypto failed\n");
+ return -ENOMEM;
+ }
+ debug_register_view(cfm_dbg, &debug_sprintf_view);
+
+ rc = perf_pmu_register(&paicrypt, "pai_crypto", -1);
+ if (rc) {
+ pr_err("Registering the pai_crypto PMU failed with rc=%i\n",
+ rc);
+ debug_unregister_view(cfm_dbg, &debug_sprintf_view);
+ debug_unregister(cfm_dbg);
+ return rc;
+ }
+ return 0;
+}
+
+device_initcall(paicrypt_init);
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
new file mode 100644
index 000000000000..af7f2b538c8f
--- /dev/null
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Performance event support - Processor Activity Instrumentation Extension
+ * Facility
+ *
+ * Copyright IBM Corp. 2022
+ * Author(s): Thomas Richter <tmricht@linux.ibm.com>
+ */
+#define KMSG_COMPONENT "pai_ext"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+#include <asm/ctlreg.h>
+#include <asm/pai.h>
+#include <asm/debug.h>
+
+#define PAIE1_CB_SZ 0x200 /* Size of PAIE1 control block */
+#define PAIE1_CTRBLOCK_SZ 0x400 /* Size of PAIE1 counter blocks */
+
+static debug_info_t *paiext_dbg;
+static unsigned int paiext_cnt; /* Extracted with QPACI instruction */
+
+struct pai_userdata {
+ u16 num;
+ u64 value;
+} __packed;
+
+/* Create the PAI extension 1 control block area.
+ * The PAI extension control block 1 is pointed to by lowcore
+ * address 0x1508 for each CPU. This control block is 512 bytes in size
+ * and requires a 512 byte boundary alignment.
+ */
+struct paiext_cb { /* PAI extension 1 control block */
+ u64 header; /* Not used */
+ u64 reserved1;
+ u64 acc; /* Addr to analytics counter control block */
+ u8 reserved2[488];
+} __packed;
+
+struct paiext_map {
+ unsigned long *area; /* Area for CPU to store counters */
+ struct pai_userdata *save; /* Area to store non-zero counters */
+ enum paievt_mode mode; /* Type of event */
+ unsigned int active_events; /* # of PAI Extension users */
+ refcount_t refcnt;
+ struct perf_event *event; /* Perf event for sampling */
+ struct paiext_cb *paiext_cb; /* PAI extension control block area */
+};
+
+struct paiext_mapptr {
+ struct paiext_map *mapptr;
+};
+
+static struct paiext_root { /* Anchor to per CPU data */
+ refcount_t refcnt; /* Overall active events */
+ struct paiext_mapptr __percpu *mapptr;
+} paiext_root;
+
+/* Free per CPU data when the last event is removed. */
+static void paiext_root_free(void)
+{
+ if (refcount_dec_and_test(&paiext_root.refcnt)) {
+ free_percpu(paiext_root.mapptr);
+ paiext_root.mapptr = NULL;
+ }
+}
+
+/* On initialization of first event also allocate per CPU data dynamically.
+ * Start with an array of pointers, the array size is the maximum number of
+ * CPUs possible, which might be larger than the number of CPUs currently
+ * online.
+ */
+static int paiext_root_alloc(void)
+{
+ if (!refcount_inc_not_zero(&paiext_root.refcnt)) {
+ /* The memory is already zeroed. */
+ paiext_root.mapptr = alloc_percpu(struct paiext_mapptr);
+ if (!paiext_root.mapptr) {
+ /* Returning without refcnt adjustment is ok. The
+ * error code is handled by paiext_alloc() which
+ * decrements refcnt when an event can not be
+ * created.
+ */
+ return -ENOMEM;
+ }
+ refcount_set(&paiext_root.refcnt, 1);
+ }
+ return 0;
+}
+
+/* Protects against concurrent increment of sampler and counter member
+ * increments at the same time and prohibits concurrent execution of
+ * counting and sampling events.
+ * Ensures that analytics counter block is deallocated only when the
+ * sampling and counting on that cpu is zero.
+ * For details see paiext_alloc().
+ */
+static DEFINE_MUTEX(paiext_reserve_mutex);
+
+/* Free all memory allocated for event counting/sampling setup */
+static void paiext_free(struct paiext_mapptr *mp)
+{
+ kfree(mp->mapptr->area);
+ kfree(mp->mapptr->paiext_cb);
+ kvfree(mp->mapptr->save);
+ kfree(mp->mapptr);
+ mp->mapptr = NULL;
+}
+
+/* Release the PMU if event is the last perf event */
+static void paiext_event_destroy(struct perf_event *event)
+{
+ struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ struct paiext_map *cpump = mp->mapptr;
+
+ mutex_lock(&paiext_reserve_mutex);
+ cpump->event = NULL;
+ if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
+ paiext_free(mp);
+ paiext_root_free();
+ mutex_unlock(&paiext_reserve_mutex);
+ debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__,
+ event->cpu, mp->mapptr);
+
+}
+
+/* Used to avoid races in checking concurrent access of counting and
+ * sampling for pai_extension events.
+ *
+ * Only one instance of event pai_ext/NNPA_ALL/ for sampling is
+ * allowed and when this event is running, no counting event is allowed.
+ * Several counting events are allowed in parallel, but no sampling event
+ * is allowed while one (or more) counting events are running.
+ *
+ * This function is called in process context and it is safe to block.
+ * When the event initialization functions fails, no other call back will
+ * be invoked.
+ *
+ * Allocate the memory for the event.
+ */
+static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
+{
+ struct paiext_mapptr *mp;
+ struct paiext_map *cpump;
+ int rc;
+
+ mutex_lock(&paiext_reserve_mutex);
+
+ rc = paiext_root_alloc();
+ if (rc)
+ goto unlock;
+
+ mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
+ cpump = mp->mapptr;
+ if (!cpump) { /* Paiext_map allocated? */
+ rc = -ENOMEM;
+ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL);
+ if (!cpump)
+ goto undo;
+
+ /* Allocate memory for counter area and counter extraction.
+ * These are
+ * - a 512 byte block and requires 512 byte boundary alignment.
+ * - a 1KB byte block and requires 1KB boundary alignment.
+ * Only the first counting event has to allocate the area.
+ *
+ * Note: This works with commit 59bb47985c1d by default.
+ * Backporting this to kernels without this commit might
+ * need adjustment.
+ */
+ mp->mapptr = cpump;
+ cpump->area = kzalloc(PAIE1_CTRBLOCK_SZ, GFP_KERNEL);
+ cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL);
+ cpump->save = kvmalloc_array(paiext_cnt + 1,
+ sizeof(struct pai_userdata),
+ GFP_KERNEL);
+ if (!cpump->save || !cpump->area || !cpump->paiext_cb) {
+ paiext_free(mp);
+ goto undo;
+ }
+ refcount_set(&cpump->refcnt, 1);
+ cpump->mode = a->sample_period ? PAI_MODE_SAMPLING
+ : PAI_MODE_COUNTING;
+ } else {
+ /* Multiple invocation, check what is active.
+ * Supported are multiple counter events or only one sampling
+ * event concurrently at any one time.
+ */
+ if (cpump->mode == PAI_MODE_SAMPLING ||
+ (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) {
+ rc = -EBUSY;
+ goto undo;
+ }
+ refcount_inc(&cpump->refcnt);
+ }
+
+ rc = 0;
+ cpump->event = event;
+
+undo:
+ if (rc) {
+ /* Error in allocation of event, decrement anchor. Since
+ * the event in not created, its destroy() function is never
+ * invoked. Adjust the reference counter for the anchor.
+ */
+ paiext_root_free();
+ }
+unlock:
+ mutex_unlock(&paiext_reserve_mutex);
+ /* If rc is non-zero, no increment of counter/sampler was done. */
+ return rc;
+}
+
+/* The PAI extension 1 control block supports up to 128 entries. Return
+ * the index within PAIE1_CB given the event number. Also validate event
+ * number.
+ */
+static int paiext_event_valid(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (cfg >= PAI_NNPA_BASE && cfg <= PAI_NNPA_BASE + paiext_cnt) {
+ /* Offset NNPA in paiext_cb */
+ event->hw.config_base = offsetof(struct paiext_cb, acc);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+/* Might be called on different CPU than the one the event is intended for. */
+static int paiext_event_init(struct perf_event *event)
+{
+ struct perf_event_attr *a = &event->attr;
+ int rc;
+
+ /* PMU pai_ext registered as PERF_TYPE_RAW, check event type */
+ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
+ return -ENOENT;
+ /* PAI extension event must be valid and in supported range */
+ rc = paiext_event_valid(event);
+ if (rc)
+ return rc;
+ /* Allow only CPU wide operation, no process context for now. */
+ if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1)
+ return -ENOENT;
+ /* Allow only event NNPA_ALL for sampling. */
+ if (a->sample_period && a->config != PAI_NNPA_BASE)
+ return -EINVAL;
+ /* Prohibit exclude_user event selection */
+ if (a->exclude_user)
+ return -EINVAL;
+
+ rc = paiext_alloc(a, event);
+ if (rc)
+ return rc;
+ event->destroy = paiext_event_destroy;
+
+ if (a->sample_period) {
+ a->sample_period = 1;
+ a->freq = 0;
+ /* Register for paicrypt_sched_task() to be called */
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ /* Add raw data which are the memory mapped counters */
+ a->sample_type |= PERF_SAMPLE_RAW;
+ /* Turn off inheritance */
+ a->inherit = 0;
+ }
+
+ return 0;
+}
+
+static u64 paiext_getctr(unsigned long *area, int nr)
+{
+ return area[nr];
+}
+
+/* Read the counter values. Return value from location in buffer. For event
+ * NNPA_ALL sum up all events.
+ */
+static u64 paiext_getdata(struct perf_event *event)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ u64 sum = 0;
+ int i;
+
+ if (event->attr.config != PAI_NNPA_BASE)
+ return paiext_getctr(cpump->area,
+ event->attr.config - PAI_NNPA_BASE);
+
+ for (i = 1; i <= paiext_cnt; i++)
+ sum += paiext_getctr(cpump->area, i);
+
+ return sum;
+}
+
+static u64 paiext_getall(struct perf_event *event)
+{
+ return paiext_getdata(event);
+}
+
+static void paiext_read(struct perf_event *event)
+{
+ u64 prev, new, delta;
+
+ prev = local64_read(&event->hw.prev_count);
+ new = paiext_getall(event);
+ local64_set(&event->hw.prev_count, new);
+ delta = new - prev;
+ local64_add(delta, &event->count);
+}
+
+static void paiext_start(struct perf_event *event, int flags)
+{
+ u64 sum;
+
+ if (!event->attr.sample_period) { /* Counting */
+ if (!event->hw.last_tag) {
+ event->hw.last_tag = 1;
+ sum = paiext_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
+ }
+ } else { /* Sampling */
+ perf_sched_cb_inc(event->pmu);
+ }
+}
+
+static int paiext_add(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ if (++cpump->active_events == 1) {
+ S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb);
+ pcb->acc = virt_to_phys(cpump->area) | 0x1;
+ /* Enable CPU instruction lookup for PAIE1 control block */
+ local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT);
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+ cpump->event = event;
+ if (flags & PERF_EF_START)
+ paiext_start(event, PERF_EF_RELOAD);
+ event->hw.state = 0;
+ return 0;
+}
+
+static void paiext_stop(struct perf_event *event, int flags)
+{
+ if (!event->attr.sample_period) /* Counting */
+ paiext_read(event);
+ else /* Sampling */
+ perf_sched_cb_dec(event->pmu);
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void paiext_del(struct perf_event *event, int flags)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct paiext_cb *pcb = cpump->paiext_cb;
+
+ paiext_stop(event, PERF_EF_UPDATE);
+ if (--cpump->active_events == 0) {
+ /* Disable CPU instruction lookup for PAIE1 control block */
+ local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT);
+ pcb->acc = 0;
+ S390_lowcore.aicd = 0;
+ debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
+ __func__, S390_lowcore.aicd, pcb->acc);
+ }
+}
+
+/* Create raw data and save it in buffer. Returns number of bytes copied.
+ * Saves only positive counter entries of the form
+ * 2 bytes: Number of counter
+ * 8 bytes: Value of counter
+ */
+static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
+{
+ int i, outidx = 0;
+
+ for (i = 1; i <= paiext_cnt; i++) {
+ u64 val = paiext_getctr(area, i);
+
+ if (val) {
+ userdata[outidx].num = i;
+ userdata[outidx].value = val;
+ outidx++;
+ }
+ }
+ return outidx * sizeof(*userdata);
+}
+
+/* Write sample when one or more counters values are nonzero.
+ *
+ * Note: The function paiext_sched_task() and paiext_push_sample() are not
+ * invoked after function paiext_del() has been called because of function
+ * perf_sched_cb_dec().
+ * The function paiext_sched_task() and paiext_push_sample() are only
+ * called when sampling is active. Function perf_sched_cb_inc()
+ * has been invoked to install function paiext_sched_task() as call back
+ * to run at context switch time (see paiext_add()).
+ *
+ * This causes function perf_event_context_sched_out() and
+ * perf_event_context_sched_in() to check whether the PMU has installed an
+ * sched_task() callback. That callback is not active after paiext_del()
+ * returns and has deleted the event on that CPU.
+ */
+static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
+ struct perf_event *event)
+{
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ int overflow;
+
+ /* Setup perf sample */
+ memset(&regs, 0, sizeof(regs));
+ memset(&raw, 0, sizeof(raw));
+ memset(&data, 0, sizeof(data));
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+ if (event->attr.sample_type & PERF_SAMPLE_TID) {
+ data.tid_entry.pid = task_tgid_nr(current);
+ data.tid_entry.tid = task_pid_nr(current);
+ }
+ if (event->attr.sample_type & PERF_SAMPLE_TIME)
+ data.time = event->clock();
+ if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
+ data.id = event->id;
+ if (event->attr.sample_type & PERF_SAMPLE_CPU)
+ data.cpu_entry.cpu = smp_processor_id();
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.frag.size = rawsize;
+ raw.frag.data = cpump->save;
+ perf_sample_save_raw_data(&data, &raw);
+ }
+
+ overflow = perf_event_overflow(event, &data, &regs);
+ perf_event_update_userpage(event);
+ /* Clear lowcore area after read */
+ memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+ return overflow;
+}
+
+/* Check if there is data to be saved on schedule out of a task. */
+static int paiext_have_sample(void)
+{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
+ struct perf_event *event = cpump->event;
+ size_t rawsize;
+ int rc = 0;
+
+ if (!event)
+ return 0;
+ rawsize = paiext_copy(cpump->save, cpump->area);
+ if (rawsize) /* Incremented counters */
+ rc = paiext_push_sample(rawsize, cpump, event);
+ return rc;
+}
+
+/* Called on schedule-in and schedule-out. No access to event structure,
+ * but for sampling only event NNPA_ALL is allowed.
+ */
+static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ /* We started with a clean page on event installation. So read out
+ * results on schedule_out and if page was dirty, clear values.
+ */
+ if (!sched_in)
+ paiext_have_sample();
+}
+
+/* Attribute definitions for pai extension1 interface. As with other CPU
+ * Measurement Facilities, there is one attribute per mapped counter.
+ * The number of mapped counters may vary per machine generation. Use
+ * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction
+ * to determine the number of mapped counters. The instructions returns
+ * a positive number, which is the highest number of supported counters.
+ * All counters less than this number are also supported, there are no
+ * holes. A returned number of zero means no support for mapped counters.
+ *
+ * The identification of the counter is a unique number. The chosen range
+ * is 0x1800 + offset in mapped kernel page.
+ * All CPU Measurement Facility counters identifiers must be unique and
+ * the numbers from 0 to 496 are already used for the CPU Measurement
+ * Counter facility. Number 0x1000 to 0x103e are used for PAI cryptography
+ * counters.
+ * Numbers 0xb0000, 0xbc000 and 0xbd000 are already
+ * used for the CPU Measurement Sampling facility.
+ */
+PMU_FORMAT_ATTR(event, "config:0-63");
+
+static struct attribute *paiext_format_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group paiext_events_group = {
+ .name = "events",
+ .attrs = NULL, /* Filled in attr_event_init() */
+};
+
+static struct attribute_group paiext_format_group = {
+ .name = "format",
+ .attrs = paiext_format_attr,
+};
+
+static const struct attribute_group *paiext_attr_groups[] = {
+ &paiext_events_group,
+ &paiext_format_group,
+ NULL,
+};
+
+/* Performance monitoring unit for mapped counters */
+static struct pmu paiext = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = paiext_event_init,
+ .add = paiext_add,
+ .del = paiext_del,
+ .start = paiext_start,
+ .stop = paiext_stop,
+ .read = paiext_read,
+ .sched_task = paiext_sched_task,
+ .attr_groups = paiext_attr_groups,
+};
+
+/* List of symbolic PAI extension 1 NNPA counter names. */
+static const char * const paiext_ctrnames[] = {
+ [0] = "NNPA_ALL",
+ [1] = "NNPA_ADD",
+ [2] = "NNPA_SUB",
+ [3] = "NNPA_MUL",
+ [4] = "NNPA_DIV",
+ [5] = "NNPA_MIN",
+ [6] = "NNPA_MAX",
+ [7] = "NNPA_LOG",
+ [8] = "NNPA_EXP",
+ [9] = "NNPA_IBM_RESERVED_9",
+ [10] = "NNPA_RELU",
+ [11] = "NNPA_TANH",
+ [12] = "NNPA_SIGMOID",
+ [13] = "NNPA_SOFTMAX",
+ [14] = "NNPA_BATCHNORM",
+ [15] = "NNPA_MAXPOOL2D",
+ [16] = "NNPA_AVGPOOL2D",
+ [17] = "NNPA_LSTMACT",
+ [18] = "NNPA_GRUACT",
+ [19] = "NNPA_CONVOLUTION",
+ [20] = "NNPA_MATMUL_OP",
+ [21] = "NNPA_MATMUL_OP_BCAST23",
+ [22] = "NNPA_SMALLBATCH",
+ [23] = "NNPA_LARGEDIM",
+ [24] = "NNPA_SMALLTENSOR",
+ [25] = "NNPA_1MFRAME",
+ [26] = "NNPA_2GFRAME",
+ [27] = "NNPA_ACCESSEXCEPT",
+};
+
+static void __init attr_event_free(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+ struct device_attribute *dap;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ dap = container_of(attrs[i], struct device_attribute, attr);
+ pa = container_of(dap, struct perf_pmu_events_attr, attr);
+ kfree(pa);
+ }
+ kfree(attrs);
+}
+
+static int __init attr_event_init_one(struct attribute **attrs, int num)
+{
+ struct perf_pmu_events_attr *pa;
+
+ pa = kzalloc(sizeof(*pa), GFP_KERNEL);
+ if (!pa)
+ return -ENOMEM;
+
+ sysfs_attr_init(&pa->attr.attr);
+ pa->id = PAI_NNPA_BASE + num;
+ pa->attr.attr.name = paiext_ctrnames[num];
+ pa->attr.attr.mode = 0444;
+ pa->attr.show = cpumf_events_sysfs_show;
+ pa->attr.store = NULL;
+ attrs[num] = &pa->attr.attr;
+ return 0;
+}
+
+/* Create PMU sysfs event attributes on the fly. */
+static int __init attr_event_init(void)
+{
+ struct attribute **attrs;
+ int ret, i;
+
+ attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
+ GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
+ ret = attr_event_init_one(attrs, i);
+ if (ret) {
+ attr_event_free(attrs, i - 1);
+ return ret;
+ }
+ }
+ attrs[i] = NULL;
+ paiext_events_group.attrs = attrs;
+ return 0;
+}
+
+static int __init paiext_init(void)
+{
+ struct qpaci_info_block ib;
+ int rc = -ENOMEM;
+
+ if (!test_facility(197))
+ return 0;
+
+ qpaci(&ib);
+ paiext_cnt = ib.num_nnpa;
+ if (paiext_cnt >= PAI_NNPA_MAXCTR)
+ paiext_cnt = PAI_NNPA_MAXCTR;
+ if (!paiext_cnt)
+ return 0;
+
+ rc = attr_event_init();
+ if (rc) {
+ pr_err("Creation of PMU " KMSG_COMPONENT " /sysfs failed\n");
+ return rc;
+ }
+
+ /* Setup s390dbf facility */
+ paiext_dbg = debug_register(KMSG_COMPONENT, 2, 256, 128);
+ if (!paiext_dbg) {
+ pr_err("Registration of s390dbf " KMSG_COMPONENT " failed\n");
+ rc = -ENOMEM;
+ goto out_init;
+ }
+ debug_register_view(paiext_dbg, &debug_sprintf_view);
+
+ rc = perf_pmu_register(&paiext, KMSG_COMPONENT, -1);
+ if (rc) {
+ pr_err("Registration of " KMSG_COMPONENT " PMU failed with "
+ "rc=%i\n", rc);
+ goto out_pmu;
+ }
+
+ return 0;
+
+out_pmu:
+ debug_unregister_view(paiext_dbg, &debug_sprintf_view);
+ debug_unregister(paiext_dbg);
+out_init:
+ attr_event_free(paiext_events_group.attrs,
+ ARRAY_SIZE(paiext_ctrnames) + 1);
+ return rc;
+}
+
+device_initcall(paiext_init);
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index 4352a504f235..3d93656bd948 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -20,8 +20,10 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return 0;
idx -= PERF_REG_S390_FP0;
- fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx)
- : current->thread.fpu.fprs[idx];
+ if (cpu_has_vx())
+ fp = *(freg_t *)(current->thread.fpu.vxrs + idx);
+ else
+ fp = current->thread.fpu.fprs[idx];
return fp.ui;
}
@@ -53,8 +55,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
/*
* Use the regs from the first interruption and let
diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S
deleted file mode 100644
index 59dee9d3bebf..000000000000
--- a/arch/s390/kernel/pgm_check.S
+++ /dev/null
@@ -1,147 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Program check table.
- *
- * Copyright IBM Corp. 2012
- */
-
-#include <linux/linkage.h>
-
-#define PGM_CHECK(handler) .quad handler
-#define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler)
-
-/*
- * The program check table contains exactly 128 (0x00-0x7f) entries. Each
- * line defines the function to be called corresponding to the program check
- * interruption code.
- */
-.section .rodata, "a"
-ENTRY(pgm_check_table)
-PGM_CHECK_DEFAULT /* 00 */
-PGM_CHECK(illegal_op) /* 01 */
-PGM_CHECK(privileged_op) /* 02 */
-PGM_CHECK(execute_exception) /* 03 */
-PGM_CHECK(do_protection_exception) /* 04 */
-PGM_CHECK(addressing_exception) /* 05 */
-PGM_CHECK(specification_exception) /* 06 */
-PGM_CHECK(data_exception) /* 07 */
-PGM_CHECK(overflow_exception) /* 08 */
-PGM_CHECK(divide_exception) /* 09 */
-PGM_CHECK(overflow_exception) /* 0a */
-PGM_CHECK(divide_exception) /* 0b */
-PGM_CHECK(hfp_overflow_exception) /* 0c */
-PGM_CHECK(hfp_underflow_exception) /* 0d */
-PGM_CHECK(hfp_significance_exception) /* 0e */
-PGM_CHECK(hfp_divide_exception) /* 0f */
-PGM_CHECK(do_dat_exception) /* 10 */
-PGM_CHECK(do_dat_exception) /* 11 */
-PGM_CHECK(translation_exception) /* 12 */
-PGM_CHECK(special_op_exception) /* 13 */
-PGM_CHECK_DEFAULT /* 14 */
-PGM_CHECK(operand_exception) /* 15 */
-PGM_CHECK_DEFAULT /* 16 */
-PGM_CHECK_DEFAULT /* 17 */
-PGM_CHECK(transaction_exception) /* 18 */
-PGM_CHECK_DEFAULT /* 19 */
-PGM_CHECK_DEFAULT /* 1a */
-PGM_CHECK(vector_exception) /* 1b */
-PGM_CHECK(space_switch_exception) /* 1c */
-PGM_CHECK(hfp_sqrt_exception) /* 1d */
-PGM_CHECK_DEFAULT /* 1e */
-PGM_CHECK_DEFAULT /* 1f */
-PGM_CHECK_DEFAULT /* 20 */
-PGM_CHECK_DEFAULT /* 21 */
-PGM_CHECK_DEFAULT /* 22 */
-PGM_CHECK_DEFAULT /* 23 */
-PGM_CHECK_DEFAULT /* 24 */
-PGM_CHECK_DEFAULT /* 25 */
-PGM_CHECK_DEFAULT /* 26 */
-PGM_CHECK_DEFAULT /* 27 */
-PGM_CHECK_DEFAULT /* 28 */
-PGM_CHECK_DEFAULT /* 29 */
-PGM_CHECK_DEFAULT /* 2a */
-PGM_CHECK_DEFAULT /* 2b */
-PGM_CHECK_DEFAULT /* 2c */
-PGM_CHECK_DEFAULT /* 2d */
-PGM_CHECK_DEFAULT /* 2e */
-PGM_CHECK_DEFAULT /* 2f */
-PGM_CHECK_DEFAULT /* 30 */
-PGM_CHECK_DEFAULT /* 31 */
-PGM_CHECK_DEFAULT /* 32 */
-PGM_CHECK_DEFAULT /* 33 */
-PGM_CHECK_DEFAULT /* 34 */
-PGM_CHECK_DEFAULT /* 35 */
-PGM_CHECK_DEFAULT /* 36 */
-PGM_CHECK_DEFAULT /* 37 */
-PGM_CHECK(do_dat_exception) /* 38 */
-PGM_CHECK(do_dat_exception) /* 39 */
-PGM_CHECK(do_dat_exception) /* 3a */
-PGM_CHECK(do_dat_exception) /* 3b */
-PGM_CHECK_DEFAULT /* 3c */
-PGM_CHECK_DEFAULT /* 3d */
-PGM_CHECK_DEFAULT /* 3e */
-PGM_CHECK_DEFAULT /* 3f */
-PGM_CHECK_DEFAULT /* 40 */
-PGM_CHECK_DEFAULT /* 41 */
-PGM_CHECK_DEFAULT /* 42 */
-PGM_CHECK_DEFAULT /* 43 */
-PGM_CHECK_DEFAULT /* 44 */
-PGM_CHECK_DEFAULT /* 45 */
-PGM_CHECK_DEFAULT /* 46 */
-PGM_CHECK_DEFAULT /* 47 */
-PGM_CHECK_DEFAULT /* 48 */
-PGM_CHECK_DEFAULT /* 49 */
-PGM_CHECK_DEFAULT /* 4a */
-PGM_CHECK_DEFAULT /* 4b */
-PGM_CHECK_DEFAULT /* 4c */
-PGM_CHECK_DEFAULT /* 4d */
-PGM_CHECK_DEFAULT /* 4e */
-PGM_CHECK_DEFAULT /* 4f */
-PGM_CHECK_DEFAULT /* 50 */
-PGM_CHECK_DEFAULT /* 51 */
-PGM_CHECK_DEFAULT /* 52 */
-PGM_CHECK_DEFAULT /* 53 */
-PGM_CHECK_DEFAULT /* 54 */
-PGM_CHECK_DEFAULT /* 55 */
-PGM_CHECK_DEFAULT /* 56 */
-PGM_CHECK_DEFAULT /* 57 */
-PGM_CHECK_DEFAULT /* 58 */
-PGM_CHECK_DEFAULT /* 59 */
-PGM_CHECK_DEFAULT /* 5a */
-PGM_CHECK_DEFAULT /* 5b */
-PGM_CHECK_DEFAULT /* 5c */
-PGM_CHECK_DEFAULT /* 5d */
-PGM_CHECK_DEFAULT /* 5e */
-PGM_CHECK_DEFAULT /* 5f */
-PGM_CHECK_DEFAULT /* 60 */
-PGM_CHECK_DEFAULT /* 61 */
-PGM_CHECK_DEFAULT /* 62 */
-PGM_CHECK_DEFAULT /* 63 */
-PGM_CHECK_DEFAULT /* 64 */
-PGM_CHECK_DEFAULT /* 65 */
-PGM_CHECK_DEFAULT /* 66 */
-PGM_CHECK_DEFAULT /* 67 */
-PGM_CHECK_DEFAULT /* 68 */
-PGM_CHECK_DEFAULT /* 69 */
-PGM_CHECK_DEFAULT /* 6a */
-PGM_CHECK_DEFAULT /* 6b */
-PGM_CHECK_DEFAULT /* 6c */
-PGM_CHECK_DEFAULT /* 6d */
-PGM_CHECK_DEFAULT /* 6e */
-PGM_CHECK_DEFAULT /* 6f */
-PGM_CHECK_DEFAULT /* 70 */
-PGM_CHECK_DEFAULT /* 71 */
-PGM_CHECK_DEFAULT /* 72 */
-PGM_CHECK_DEFAULT /* 73 */
-PGM_CHECK_DEFAULT /* 74 */
-PGM_CHECK_DEFAULT /* 75 */
-PGM_CHECK_DEFAULT /* 76 */
-PGM_CHECK_DEFAULT /* 77 */
-PGM_CHECK_DEFAULT /* 78 */
-PGM_CHECK_DEFAULT /* 79 */
-PGM_CHECK_DEFAULT /* 7a */
-PGM_CHECK_DEFAULT /* 7b */
-PGM_CHECK_DEFAULT /* 7c */
-PGM_CHECK_DEFAULT /* 7d */
-PGM_CHECK_DEFAULT /* 7e */
-PGM_CHECK_DEFAULT /* 7f */
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 6ccef5f29761..4e3b366589fb 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -29,8 +29,9 @@
#include <linux/random.h>
#include <linux/export.h>
#include <linux/init_task.h>
+#include <linux/entry-common.h>
+#include <linux/io.h>
#include <asm/cpu_mf.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include <asm/vtimer.h>
#include <asm/exec.h>
@@ -43,9 +44,22 @@
#include <asm/unwind.h>
#include "entry.h"
-asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
+void ret_from_fork(void) asm("ret_from_fork");
-extern void kernel_thread_starter(void);
+void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs)
+{
+ void (*func)(void *arg);
+
+ schedule_tail(prev);
+
+ if (!user_mode(regs)) {
+ /* Kernel thread */
+ func = (void *)regs->gprs[9];
+ func((void *)regs->gprs[10]);
+ }
+ clear_pt_regs_flag(regs, PIF_SYSCALL);
+ syscall_exit_to_user_mode(regs);
+}
void flush_thread(void)
{
@@ -75,14 +89,28 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
*/
save_fpu_regs();
- memcpy(dst, src, arch_task_struct_size);
+ *dst = *src;
dst->thread.fpu.regs = dst->thread.fpu.fprs;
+
+ /*
+ * Don't transfer over the runtime instrumentation or the guarded
+ * storage control block pointers. These fields are cleared here instead
+ * of in copy_thread() to avoid premature freeing of associated memory
+ * on fork() failure. Wait to clear the RI flag because ->stack still
+ * refers to the source thread.
+ */
+ dst->thread.ri_cb = NULL;
+ dst->thread.gs_cb = NULL;
+ dst->thread.gs_bc_cb = NULL;
+
return 0;
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
+ unsigned long clone_flags = args->flags;
+ unsigned long new_stackp = args->stack;
+ unsigned long tls = args->tls;
struct fake_frame
{
struct stack_frame sf;
@@ -94,7 +122,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
/* Save access registers to new thread structure. */
save_access_regs(&p->thread.acrs[0]);
/* start new process with ar4 pointing to the correct address space */
- p->thread.mm_segment = get_fs();
/* Don't copy debug registers */
memset(&p->thread.per_user, 0, sizeof(p->thread.per_user));
memset(&p->thread.per_event, 0, sizeof(p->thread.per_event));
@@ -106,26 +133,26 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.system_timer = 0;
p->thread.hardirq_timer = 0;
p->thread.softirq_timer = 0;
+ p->thread.last_break = 1;
frame->sf.back_chain = 0;
+ frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs;
+ frame->sf.gprs[12 - 6] = (unsigned long)p;
/* new return point is ret_from_fork */
- frame->sf.gprs[8] = (unsigned long) ret_from_fork;
+ frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork;
/* fake return stack for resume(), don't go back to schedule */
- frame->sf.gprs[9] = (unsigned long) frame;
+ frame->sf.gprs[15 - 6] = (unsigned long)frame;
/* Store access registers to kernel stack of new process. */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(args->fn)) {
/* kernel thread */
memset(&frame->childregs, 0, sizeof(struct pt_regs));
- frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
- frame->childregs.psw.addr =
- (unsigned long) kernel_thread_starter;
- frame->childregs.gprs[9] = new_stackp; /* function */
- frame->childregs.gprs[10] = arg;
- frame->childregs.gprs[11] = (unsigned long) do_exit;
+ frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO |
+ PSW_MASK_EXT | PSW_MASK_MCHECK;
+ frame->childregs.gprs[9] = (unsigned long)args->fn;
+ frame->childregs.gprs[10] = (unsigned long)args->fn_arg;
frame->childregs.orig_gpr2 = -1;
-
+ frame->childregs.last_break = 1;
return 0;
}
frame->childregs = *current_pt_regs();
@@ -133,13 +160,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
frame->childregs.flags = 0;
if (new_stackp)
frame->childregs.gprs[15] = new_stackp;
-
- /* Don't copy runtime instrumentation info */
- p->thread.ri_cb = NULL;
+ /*
+ * Clear the runtime instrumentation flag after the above childregs
+ * copy. The CB pointer was already cleared in arch_dup_task_struct().
+ */
frame->childregs.psw.mask &= ~PSW_MASK_RI;
- /* Don't copy guarded storage control block */
- p->thread.gs_cb = NULL;
- p->thread.gs_bc_cb = NULL;
/* Set a new TLS ? */
if (clone_flags & CLONE_SETTLS) {
@@ -150,39 +175,27 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
p->thread.acrs[1] = (unsigned int)tls;
}
}
+ /*
+ * s390 stores the svc return address in arch_data when calling
+ * sigreturn()/restart_syscall() via vdso. 1 means no valid address
+ * stored.
+ */
+ p->restart_block.arch_data = 1;
return 0;
}
-asmlinkage void execve_tail(void)
+void execve_tail(void)
{
current->thread.fpu.fpc = 0;
asm volatile("sfpc %0" : : "d" (0));
}
-/*
- * fill in the FPU structure for a core dump.
- */
-int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
-{
- save_fpu_regs();
- fpregs->fpc = current->thread.fpu.fpc;
- fpregs->pad = 0;
- if (MACHINE_HAS_VX)
- convert_vx_to_fp((freg_t *)&fpregs->fprs,
- current->thread.fpu.vxrs);
- else
- memcpy(&fpregs->fprs, current->thread.fpu.fprs,
- sizeof(fpregs->fprs));
- return 1;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-unsigned long get_wchan(struct task_struct *p)
+unsigned long __get_wchan(struct task_struct *p)
{
struct unwind_state state;
unsigned long ip = 0;
- if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p))
+ if (!task_stack_page(p))
return 0;
if (!try_get_task_stack(p))
@@ -209,13 +222,13 @@ unsigned long get_wchan(struct task_struct *p)
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() & ~PAGE_MASK;
+ sp -= get_random_u32_below(PAGE_SIZE);
return sp & ~0xf;
}
static inline unsigned long brk_rnd(void)
{
- return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
@@ -225,16 +238,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
ret = PAGE_ALIGN(mm->brk + brk_rnd());
return (ret > mm->brk) ? ret : mm->brk;
}
-
-void set_fs_fixup(void)
-{
- struct pt_regs *regs = current_pt_regs();
- static bool warned;
-
- set_fs(USER_DS);
- if (warned)
- return;
- WARN(1, "Unbalanced set_fs - int code: 0x%x\n", regs->int_code);
- show_registers(regs);
- warned = true;
-}
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 6ebc2117c66c..65c1464eea4f 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -8,9 +8,9 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/stop_machine.h>
-#include <linux/cpufeature.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
+#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/init.h>
#include <linux/seq_file.h>
@@ -23,8 +23,12 @@
#include <asm/elf.h>
#include <asm/lowcore.h>
#include <asm/param.h>
+#include <asm/sclp.h>
#include <asm/smp.h>
+unsigned long __read_mostly elf_hwcap;
+char elf_platform[ELF_PLATFORM_SIZE];
+
struct cpu_info {
unsigned int cpu_mhz_dynamic;
unsigned int cpu_mhz_static;
@@ -91,23 +95,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, current);
}
-/*
- * cpu_have_feature - Test CPU features on module initialization
- */
-int cpu_have_feature(unsigned int num)
-{
- return elf_hwcap & (1UL << num);
-}
-EXPORT_SYMBOL(cpu_have_feature);
-
static void show_facilities(struct seq_file *m)
{
unsigned int bit;
- long *facilities;
- facilities = (long *)&S390_lowcore.stfle_fac_list;
seq_puts(m, "facilities :");
- for_each_set_bit_inv(bit, facilities, MAX_FACILITY_BIT)
+ for_each_set_bit_inv(bit, (long *)&stfle_fac_list, MAX_FACILITY_BIT)
seq_printf(m, " %d", bit);
seq_putc(m, '\n');
}
@@ -115,15 +108,33 @@ static void show_facilities(struct seq_file *m)
static void show_cpu_summary(struct seq_file *m, void *v)
{
static const char *hwcap_str[] = {
- "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
- "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs",
- "vxe2", "vxp", "sort", "dflt"
- };
- static const char * const int_hwcap_str[] = {
- "sie"
+ [HWCAP_NR_ESAN3] = "esan3",
+ [HWCAP_NR_ZARCH] = "zarch",
+ [HWCAP_NR_STFLE] = "stfle",
+ [HWCAP_NR_MSA] = "msa",
+ [HWCAP_NR_LDISP] = "ldisp",
+ [HWCAP_NR_EIMM] = "eimm",
+ [HWCAP_NR_DFP] = "dfp",
+ [HWCAP_NR_HPAGE] = "edat",
+ [HWCAP_NR_ETF3EH] = "etf3eh",
+ [HWCAP_NR_HIGH_GPRS] = "highgprs",
+ [HWCAP_NR_TE] = "te",
+ [HWCAP_NR_VXRS] = "vx",
+ [HWCAP_NR_VXRS_BCD] = "vxd",
+ [HWCAP_NR_VXRS_EXT] = "vxe",
+ [HWCAP_NR_GS] = "gs",
+ [HWCAP_NR_VXRS_EXT2] = "vxe2",
+ [HWCAP_NR_VXRS_PDE] = "vxp",
+ [HWCAP_NR_SORT] = "sort",
+ [HWCAP_NR_DFLT] = "dflt",
+ [HWCAP_NR_VXRS_PDE2] = "vxp2",
+ [HWCAP_NR_NNPA] = "nnpa",
+ [HWCAP_NR_PCI_MIO] = "pcimio",
+ [HWCAP_NR_SIE] = "sie",
};
int i, cpu;
+ BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX);
seq_printf(m, "vendor_id : IBM/S390\n"
"# processors : %i\n"
"bogomips per cpu: %lu.%02lu\n",
@@ -134,9 +145,6 @@ static void show_cpu_summary(struct seq_file *m, void *v)
for (i = 0; i < ARRAY_SIZE(hwcap_str); i++)
if (hwcap_str[i] && (elf_hwcap & (1UL << i)))
seq_printf(m, "%s ", hwcap_str[i]);
- for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++)
- if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
- seq_printf(m, "%s ", int_hwcap_str[i]);
seq_puts(m, "\n");
show_facilities(m);
show_cacheinfo(m);
@@ -151,10 +159,155 @@ static void show_cpu_summary(struct seq_file *m, void *v)
}
}
+static int __init setup_hwcaps(void)
+{
+ /* instructions named N3, "backported" to esa-mode */
+ elf_hwcap |= HWCAP_ESAN3;
+
+ /* z/Architecture mode active */
+ elf_hwcap |= HWCAP_ZARCH;
+
+ /* store-facility-list-extended */
+ if (test_facility(7))
+ elf_hwcap |= HWCAP_STFLE;
+
+ /* message-security assist */
+ if (test_facility(17))
+ elf_hwcap |= HWCAP_MSA;
+
+ /* long-displacement */
+ if (test_facility(19))
+ elf_hwcap |= HWCAP_LDISP;
+
+ /* extended-immediate */
+ elf_hwcap |= HWCAP_EIMM;
+
+ /* extended-translation facility 3 enhancement */
+ if (test_facility(22) && test_facility(30))
+ elf_hwcap |= HWCAP_ETF3EH;
+
+ /* decimal floating point & perform floating point operation */
+ if (test_facility(42) && test_facility(44))
+ elf_hwcap |= HWCAP_DFP;
+
+ /* huge page support */
+ if (MACHINE_HAS_EDAT1)
+ elf_hwcap |= HWCAP_HPAGE;
+
+ /* 64-bit register support for 31-bit processes */
+ elf_hwcap |= HWCAP_HIGH_GPRS;
+
+ /* transactional execution */
+ if (MACHINE_HAS_TE)
+ elf_hwcap |= HWCAP_TE;
+
+ /* vector */
+ if (test_facility(129)) {
+ elf_hwcap |= HWCAP_VXRS;
+ if (test_facility(134))
+ elf_hwcap |= HWCAP_VXRS_BCD;
+ if (test_facility(135))
+ elf_hwcap |= HWCAP_VXRS_EXT;
+ if (test_facility(148))
+ elf_hwcap |= HWCAP_VXRS_EXT2;
+ if (test_facility(152))
+ elf_hwcap |= HWCAP_VXRS_PDE;
+ if (test_facility(192))
+ elf_hwcap |= HWCAP_VXRS_PDE2;
+ }
+
+ if (test_facility(150))
+ elf_hwcap |= HWCAP_SORT;
+
+ if (test_facility(151))
+ elf_hwcap |= HWCAP_DFLT;
+
+ if (test_facility(165))
+ elf_hwcap |= HWCAP_NNPA;
+
+ /* guarded storage */
+ if (MACHINE_HAS_GS)
+ elf_hwcap |= HWCAP_GS;
+
+ if (MACHINE_HAS_PCI_MIO)
+ elf_hwcap |= HWCAP_PCI_MIO;
+
+ /* virtualization support */
+ if (sclp.has_sief2)
+ elf_hwcap |= HWCAP_SIE;
+
+ return 0;
+}
+arch_initcall(setup_hwcaps);
+
+static int __init setup_elf_platform(void)
+{
+ struct cpuid cpu_id;
+
+ get_cpu_id(&cpu_id);
+ add_device_randomness(&cpu_id, sizeof(cpu_id));
+ switch (cpu_id.machine) {
+ default: /* Use "z10" as default. */
+ strcpy(elf_platform, "z10");
+ break;
+ case 0x2817:
+ case 0x2818:
+ strcpy(elf_platform, "z196");
+ break;
+ case 0x2827:
+ case 0x2828:
+ strcpy(elf_platform, "zEC12");
+ break;
+ case 0x2964:
+ case 0x2965:
+ strcpy(elf_platform, "z13");
+ break;
+ case 0x3906:
+ case 0x3907:
+ strcpy(elf_platform, "z14");
+ break;
+ case 0x8561:
+ case 0x8562:
+ strcpy(elf_platform, "z15");
+ break;
+ case 0x3931:
+ case 0x3932:
+ strcpy(elf_platform, "z16");
+ break;
+ }
+ return 0;
+}
+arch_initcall(setup_elf_platform);
+
+static void show_cpu_topology(struct seq_file *m, unsigned long n)
+{
+#ifdef CONFIG_SCHED_TOPOLOGY
+ seq_printf(m, "physical id : %d\n", topology_physical_package_id(n));
+ seq_printf(m, "core id : %d\n", topology_core_id(n));
+ seq_printf(m, "book id : %d\n", topology_book_id(n));
+ seq_printf(m, "drawer id : %d\n", topology_drawer_id(n));
+ seq_printf(m, "dedicated : %d\n", topology_cpu_dedicated(n));
+ seq_printf(m, "address : %d\n", smp_cpu_get_cpu_address(n));
+ seq_printf(m, "siblings : %d\n", cpumask_weight(topology_core_cpumask(n)));
+ seq_printf(m, "cpu cores : %d\n", topology_booted_cores(n));
+#endif /* CONFIG_SCHED_TOPOLOGY */
+}
+
+static void show_cpu_ids(struct seq_file *m, unsigned long n)
+{
+ struct cpuid *id = &per_cpu(cpu_info.cpu_id, n);
+
+ seq_printf(m, "version : %02X\n", id->version);
+ seq_printf(m, "identification : %06X\n", id->ident);
+ seq_printf(m, "machine : %04X\n", id->machine);
+}
+
static void show_cpu_mhz(struct seq_file *m, unsigned long n)
{
struct cpu_info *c = per_cpu_ptr(&cpu_info, n);
+ if (!machine_has_cpu_mhz)
+ return;
seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic);
seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static);
}
@@ -165,12 +318,13 @@ static void show_cpu_mhz(struct seq_file *m, unsigned long n)
static int show_cpuinfo(struct seq_file *m, void *v)
{
unsigned long n = (unsigned long) v - 1;
+ unsigned long first = cpumask_first(cpu_online_mask);
- if (!n)
+ if (n == first)
show_cpu_summary(m, v);
- if (!machine_has_cpu_mhz)
- return 0;
seq_printf(m, "\ncpu number : %ld\n", n);
+ show_cpu_topology(m, n);
+ show_cpu_ids(m, n);
show_cpu_mhz(m, n);
return 0;
}
@@ -179,12 +333,14 @@ static inline void *c_update(loff_t *pos)
{
if (*pos)
*pos = cpumask_next(*pos - 1, cpu_online_mask);
+ else
+ *pos = cpumask_first(cpu_online_mask);
return *pos < nr_cpu_ids ? (void *)*pos + 1 : NULL;
}
static void *c_start(struct seq_file *m, loff_t *pos)
{
- get_online_cpus();
+ cpus_read_lock();
return c_update(pos);
}
@@ -196,7 +352,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
static void c_stop(struct seq_file *m, void *v)
{
- put_online_cpus();
+ cpus_read_unlock();
}
const struct seq_operations cpuinfo_op = {
@@ -205,21 +361,3 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
-
-int s390_isolate_bp(void)
-{
- if (!test_facility(82))
- return -EOPNOTSUPP;
- set_thread_flag(TIF_ISOLATE_BP);
- return 0;
-}
-EXPORT_SYMBOL(s390_isolate_bp);
-
-int s390_isolate_bp_guest(void)
-{
- if (!test_facility(82))
- return -EOPNOTSUPP;
- set_thread_flag(TIF_ISOLATE_BP_GUEST);
- return 0;
-}
-EXPORT_SYMBOL(s390_isolate_bp_guest);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 58faa12542a1..f1897a8bb221 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -7,6 +7,7 @@
* Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
+#include "asm/ptrace.h"
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
@@ -20,18 +21,16 @@
#include <linux/signal.h>
#include <linux/elf.h>
#include <linux/regset.h>
-#include <linux/tracehook.h>
#include <linux/seccomp.h>
#include <linux/compat.h>
#include <trace/syscall.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/switch_to.h>
#include <asm/runtime_instr.h>
#include <asm/facility.h>
+#include <asm/fpu/api.h>
#include "entry.h"
@@ -39,20 +38,24 @@
#include "compat_ptrace.h"
#endif
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
void update_cr_regs(struct task_struct *task)
{
struct pt_regs *regs = task_pt_regs(task);
struct thread_struct *thread = &task->thread;
- struct per_regs old, new;
union ctlreg0 cr0_old, cr0_new;
union ctlreg2 cr2_old, cr2_new;
int cr0_changed, cr2_changed;
-
- __ctl_store(cr0_old.val, 0, 0);
- __ctl_store(cr2_old.val, 2, 2);
+ union {
+ struct ctlreg regs[3];
+ struct {
+ struct ctlreg control;
+ struct ctlreg start;
+ struct ctlreg end;
+ };
+ } old, new;
+
+ local_ctl_store(0, &cr0_old.reg);
+ local_ctl_store(2, &cr2_old.reg);
cr0_new = cr0_old;
cr2_new = cr2_old;
/* Take care of the enable/disable of transactional execution. */
@@ -80,38 +83,38 @@ void update_cr_regs(struct task_struct *task)
cr0_changed = cr0_new.val != cr0_old.val;
cr2_changed = cr2_new.val != cr2_old.val;
if (cr0_changed)
- __ctl_load(cr0_new.val, 0, 0);
+ local_ctl_load(0, &cr0_new.reg);
if (cr2_changed)
- __ctl_load(cr2_new.val, 2, 2);
+ local_ctl_load(2, &cr2_new.reg);
/* Copy user specified PER registers */
- new.control = thread->per_user.control;
- new.start = thread->per_user.start;
- new.end = thread->per_user.end;
+ new.control.val = thread->per_user.control;
+ new.start.val = thread->per_user.start;
+ new.end.val = thread->per_user.end;
/* merge TIF_SINGLE_STEP into user specified PER registers. */
if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) ||
test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) {
if (test_tsk_thread_flag(task, TIF_BLOCK_STEP))
- new.control |= PER_EVENT_BRANCH;
+ new.control.val |= PER_EVENT_BRANCH;
else
- new.control |= PER_EVENT_IFETCH;
- new.control |= PER_CONTROL_SUSPENSION;
- new.control |= PER_EVENT_TRANSACTION_END;
+ new.control.val |= PER_EVENT_IFETCH;
+ new.control.val |= PER_CONTROL_SUSPENSION;
+ new.control.val |= PER_EVENT_TRANSACTION_END;
if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP))
- new.control |= PER_EVENT_IFETCH;
- new.start = 0;
- new.end = -1UL;
+ new.control.val |= PER_EVENT_IFETCH;
+ new.start.val = 0;
+ new.end.val = -1UL;
}
/* Take care of the PER enablement bit in the PSW. */
- if (!(new.control & PER_EVENT_MASK)) {
+ if (!(new.control.val & PER_EVENT_MASK)) {
regs->psw.mask &= ~PSW_MASK_PER;
return;
}
regs->psw.mask |= PSW_MASK_PER;
- __ctl_store(old, 9, 11);
+ __local_ctl_store(9, 11, old.regs);
if (memcmp(&new, &old, sizeof(struct per_regs)) != 0)
- __ctl_load(new, 9, 11);
+ __local_ctl_load(9, 11, new.regs);
}
void user_enable_single_step(struct task_struct *task)
@@ -142,7 +145,7 @@ void ptrace_disable(struct task_struct *task)
memset(&task->thread.per_user, 0, sizeof(task->thread.per_user));
memset(&task->thread.per_event, 0, sizeof(task->thread.per_event));
clear_tsk_thread_flag(task, TIF_SINGLE_STEP);
- clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP);
+ clear_tsk_thread_flag(task, TIF_PER_TRAP);
task->thread.per_flags = 0;
}
@@ -151,38 +154,36 @@ void ptrace_disable(struct task_struct *task)
static inline unsigned long __peek_user_per(struct task_struct *child,
addr_t addr)
{
- struct per_struct_kernel *dummy = NULL;
-
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* Control bits of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy->cr10)
+ else if (addr == offsetof(struct per_struct_kernel, cr10))
/* Start address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->cr11)
+ else if (addr == offsetof(struct per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
-1UL : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->bits)
+ else if (addr == offsetof(struct per_struct_kernel, bits))
/* Single-step bit. */
return test_thread_flag(TIF_SINGLE_STEP) ?
(1UL << (BITS_PER_LONG - 1)) : 0;
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return child->thread.per_user.start;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return child->thread.per_user.end;
- else if (addr == (addr_t) &dummy->perc_atmid)
+ else if (addr == offsetof(struct per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (unsigned long)
child->thread.per_event.cause << (BITS_PER_LONG - 16);
- else if (addr == (addr_t) &dummy->address)
+ else if (addr == offsetof(struct per_struct_kernel, address))
/* Address of the last PER trap */
return child->thread.per_event.address;
- else if (addr == (addr_t) &dummy->access_id)
+ else if (addr == offsetof(struct per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (unsigned long)
child->thread.per_event.paid << (BITS_PER_LONG - 8);
@@ -200,73 +201,72 @@ static inline unsigned long __peek_user_per(struct task_struct *child,
*/
static unsigned long __peek_user(struct task_struct *child, addr_t addr)
{
- struct user *dummy = NULL;
addr_t offset, tmp;
- if (addr < (addr_t) &dummy->regs.acrs) {
+ if (addr < offsetof(struct user, regs.acrs)) {
/*
* psw and gprs are stored on the stack
*/
tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr);
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
/* Return a clean psw mask. */
tmp &= PSW_MASK_USER | PSW_MASK_RI;
tmp |= PSW_USER_BITS;
}
- } else if (addr < (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb reading
* from acrs[15]. Result is a 64 bit value. Read the
* 32 bit acrs[15] value and shift it by 32. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
tmp = ((unsigned long) child->thread.acrs[15]) << 32;
else
tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = (addr_t) task_pt_regs(child)->orig_gpr2;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
tmp <<= BITS_PER_LONG - 32;
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
tmp = *(addr_t *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
else
tmp = *(addr_t *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
tmp = __peek_user_per(child, addr);
} else
@@ -285,8 +285,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -298,8 +298,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
static inline void __poke_user_per(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct per_struct_kernel *dummy = NULL;
-
/*
* There are only three fields in the per_info struct that the
* debugger user can write to.
@@ -312,14 +310,14 @@ static inline void __poke_user_per(struct task_struct *child,
* addresses are used only if single stepping is not in effect.
* Writes to any other field in per_info are ignored.
*/
- if (addr == (addr_t) &dummy->cr9)
+ if (addr == offsetof(struct per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy->starting_addr)
+ else if (addr == offsetof(struct per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy->ending_addr)
+ else if (addr == offsetof(struct per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -332,14 +330,15 @@ static inline void __poke_user_per(struct task_struct *child,
*/
static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
{
- struct user *dummy = NULL;
addr_t offset;
- if (addr < (addr_t) &dummy->regs.acrs) {
+
+ if (addr < offsetof(struct user, regs.acrs)) {
+ struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy->regs.psw.mask) {
+ if (addr == offsetof(struct user, regs.psw.mask)) {
unsigned long mask = PSW_MASK_USER;
mask |= is_ri_task(child) ? PSW_MASK_RI : 0;
@@ -353,64 +352,69 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
/* Invalid addressing mode bits */
return -EINVAL;
}
- *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr) = data;
- } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
+ *(addr_t *)((addr_t) &regs->psw + addr) = data;
+ } else if (addr < offsetof(struct user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy->regs.acrs;
+ offset = addr - offsetof(struct user, regs.acrs);
/*
* Very special case: old & broken 64 bit gdb writing
* to acrs[15] with a 64 bit value. Ignore the lower
* half of the value and write the upper 32 bit to
* acrs[15]. Sick...
*/
- if (addr == (addr_t) &dummy->regs.acrs[15])
+ if (addr == offsetof(struct user, regs.acrs[15]))
child->thread.acrs[15] = (unsigned int) (data >> 32);
else
*(addr_t *)((addr_t) &child->thread.acrs + offset) = data;
- } else if (addr == (addr_t) &dummy->regs.orig_gpr2) {
+ } else if (addr == offsetof(struct user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
task_pt_regs(child)->orig_gpr2 = data;
- } else if (addr < (addr_t) &dummy->regs.fp_regs) {
+ } else if (addr < offsetof(struct user, regs.fp_regs)) {
/*
* prevent writes of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
- if ((unsigned int) data != 0 ||
- test_fp_ctl(data >> (BITS_PER_LONG - 32)))
+ if ((unsigned int)data != 0)
return -EINVAL;
child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32);
- } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
*(addr_t *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = data;
else
*(addr_t *)((addr_t)
child->thread.fpu.fprs + offset) = data;
- } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy->regs.per_info;
+ addr -= offsetof(struct user, regs.per_info);
__poke_user_per(child, addr, data);
}
@@ -427,8 +431,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data)
* an alignment of 4. Programmers from hell indeed...
*/
mask = __ADDR_MASK;
- if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs &&
- addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2)
+ if (addr >= offsetof(struct user, regs.acrs) &&
+ addr < offsetof(struct user, regs.orig_gpr2))
mask = 3;
if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK)
return -EIO;
@@ -477,9 +481,7 @@ long arch_ptrace(struct task_struct *child, long request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned long __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned long __user *)data);
case PTRACE_ENABLE_TE:
if (!MACHINE_HAS_TE)
return -EIO;
@@ -536,37 +538,35 @@ long arch_ptrace(struct task_struct *child, long request,
static inline __u32 __peek_user_per_compat(struct task_struct *child,
addr_t addr)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* Control bits of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
PER_EVENT_IFETCH : child->thread.per_user.control;
- else if (addr == (addr_t) &dummy32->cr10)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr10))
/* Start address of the active per set. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0 : child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->cr11)
+ else if (addr == offsetof(struct compat_per_struct_kernel, cr11))
/* End address of the active per set. */
return test_thread_flag(TIF_SINGLE_STEP) ?
PSW32_ADDR_INSN : child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->bits)
+ else if (addr == offsetof(struct compat_per_struct_kernel, bits))
/* Single-step bit. */
return (__u32) test_thread_flag(TIF_SINGLE_STEP) ?
0x80000000 : 0;
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Start address of the user specified per set. */
return (__u32) child->thread.per_user.start;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* End address of the user specified per set. */
return (__u32) child->thread.per_user.end;
- else if (addr == (addr_t) &dummy32->perc_atmid)
+ else if (addr == offsetof(struct compat_per_struct_kernel, perc_atmid))
/* PER code, ATMID and AI of the last PER trap */
return (__u32) child->thread.per_event.cause << 16;
- else if (addr == (addr_t) &dummy32->address)
+ else if (addr == offsetof(struct compat_per_struct_kernel, address))
/* Address of the last PER trap */
return (__u32) child->thread.per_event.address;
- else if (addr == (addr_t) &dummy32->access_id)
+ else if (addr == offsetof(struct compat_per_struct_kernel, access_id))
/* Access id of the last PER trap */
return (__u32) child->thread.per_event.paid << 24;
return 0;
@@ -577,21 +577,20 @@ static inline __u32 __peek_user_per_compat(struct task_struct *child,
*/
static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
{
- struct compat_user *dummy32 = NULL;
addr_t offset;
__u32 tmp;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw and gprs are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
/* Fake a 31 bit psw mask. */
tmp = (__u32)(regs->psw.mask >> 32);
tmp &= PSW32_MASK_USER | PSW32_MASK_RI;
tmp |= PSW32_USER_BITS;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Fake a 31 bit psw address. */
tmp = (__u32) regs->psw.addr |
(__u32)(regs->psw.mask & PSW_MASK_BA);
@@ -599,50 +598,50 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
/* gpr 0-15 */
tmp = *(__u32 *)((addr_t) &regs->psw + addr*2 + 4);
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
tmp = *(__u32*)((addr_t) &child->thread.acrs + offset);
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4);
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
* prevent reads of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
tmp = 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
tmp = child->thread.fpu.fpc;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
tmp = *(__u32 *)
((addr_t) child->thread.fpu.vxrs + 2*offset);
else
tmp = *(__u32 *)
((addr_t) child->thread.fpu.fprs + offset);
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
tmp = __peek_user_per_compat(child, addr);
} else
@@ -669,16 +668,14 @@ static int peek_user_compat(struct task_struct *child,
static inline void __poke_user_per_compat(struct task_struct *child,
addr_t addr, __u32 data)
{
- struct compat_per_struct_kernel *dummy32 = NULL;
-
- if (addr == (addr_t) &dummy32->cr9)
+ if (addr == offsetof(struct compat_per_struct_kernel, cr9))
/* PER event mask of the user specified per set. */
child->thread.per_user.control =
data & (PER_EVENT_MASK | PER_CONTROL_MASK);
- else if (addr == (addr_t) &dummy32->starting_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, starting_addr))
/* Starting address of the user specified per set. */
child->thread.per_user.start = data;
- else if (addr == (addr_t) &dummy32->ending_addr)
+ else if (addr == offsetof(struct compat_per_struct_kernel, ending_addr))
/* Ending address of the user specified per set. */
child->thread.per_user.end = data;
}
@@ -689,16 +686,15 @@ static inline void __poke_user_per_compat(struct task_struct *child,
static int __poke_user_compat(struct task_struct *child,
addr_t addr, addr_t data)
{
- struct compat_user *dummy32 = NULL;
__u32 tmp = (__u32) data;
addr_t offset;
- if (addr < (addr_t) &dummy32->regs.acrs) {
+ if (addr < offsetof(struct compat_user, regs.acrs)) {
struct pt_regs *regs = task_pt_regs(child);
/*
* psw, gprs, acrs and orig_gpr2 are stored on the stack
*/
- if (addr == (addr_t) &dummy32->regs.psw.mask) {
+ if (addr == offsetof(struct compat_user, regs.psw.mask)) {
__u32 mask = PSW32_MASK_USER;
mask |= is_ri_task(child) ? PSW32_MASK_RI : 0;
@@ -712,62 +708,66 @@ static int __poke_user_compat(struct task_struct *child,
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) |
(regs->psw.mask & PSW_MASK_BA) |
(__u64)(tmp & mask) << 32;
- } else if (addr == (addr_t) &dummy32->regs.psw.addr) {
+ } else if (addr == offsetof(struct compat_user, regs.psw.addr)) {
/* Build a 64 bit psw address from 31 bit address. */
regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN;
/* Transfer 31 bit amode bit to psw mask. */
regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) |
(__u64)(tmp & PSW32_ADDR_AMODE);
} else {
+ if (test_pt_regs_flag(regs, PIF_SYSCALL) &&
+ addr == offsetof(struct compat_user, regs.gprs[2])) {
+ struct pt_regs *regs = task_pt_regs(child);
+
+ regs->int_code = 0x20000 | (data & 0xffff);
+ }
/* gpr 0-15 */
*(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp;
}
- } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr < offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* access registers are stored in the thread structure
*/
- offset = addr - (addr_t) &dummy32->regs.acrs;
+ offset = addr - offsetof(struct compat_user, regs.acrs);
*(__u32*)((addr_t) &child->thread.acrs + offset) = tmp;
- } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) {
+ } else if (addr == offsetof(struct compat_user, regs.orig_gpr2)) {
/*
* orig_gpr2 is stored on the kernel stack
*/
*(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp;
- } else if (addr < (addr_t) &dummy32->regs.fp_regs) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs)) {
/*
* prevent writess of padding hole between
* orig_gpr2 and fp_regs on s390.
*/
return 0;
- } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) {
+ } else if (addr == offsetof(struct compat_user, regs.fp_regs.fpc)) {
/*
* floating point control reg. is in the thread structure
*/
- if (test_fp_ctl(tmp))
- return -EINVAL;
child->thread.fpu.fpc = data;
- } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
* floating point regs. are either in child->thread.fpu
* or the child->thread.fpu.vxrs array
*/
- offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
- if (MACHINE_HAS_VX)
+ offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
+ if (cpu_has_vx())
*(__u32 *)((addr_t)
child->thread.fpu.vxrs + 2*offset) = tmp;
else
*(__u32 *)((addr_t)
child->thread.fpu.fprs + offset) = tmp;
- } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
+ } else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
*/
- addr -= (addr_t) &dummy32->regs.per_info;
+ addr -= offsetof(struct compat_user, regs.per_info);
__poke_user_per_compat(child, addr, data);
}
@@ -827,92 +827,26 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
}
return 0;
case PTRACE_GET_LAST_BREAK:
- put_user(child->thread.last_break,
- (unsigned int __user *) data);
- return 0;
+ return put_user(child->thread.last_break, (unsigned int __user *)data);
}
return compat_ptrace_request(child, request, addr, data);
}
#endif
-asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
-{
- unsigned long mask = -1UL;
-
- /*
- * The sysc_tracesys code in entry.S stored the system
- * call number to gprs[2].
- */
- if (test_thread_flag(TIF_SYSCALL_TRACE) &&
- (tracehook_report_syscall_entry(regs) ||
- regs->gprs[2] >= NR_syscalls)) {
- /*
- * Tracing decided this syscall should not happen or the
- * debugger stored an invalid system call number. Skip
- * the system call and the system call restart handling.
- */
- clear_pt_regs_flag(regs, PIF_SYSCALL);
- return -1;
- }
-
- /* Do the secure computing check after ptrace. */
- if (secure_computing()) {
- /* seccomp failures shouldn't expose any additional code. */
- return -1;
- }
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->gprs[2]);
-
- if (is_compat_task())
- mask = 0xffffffff;
-
- audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask,
- regs->gprs[3] &mask, regs->gprs[4] &mask,
- regs->gprs[5] &mask);
-
- return regs->gprs[2];
-}
-
-asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
-{
- audit_syscall_exit(regs);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_exit(regs, regs->gprs[2]);
-
- if (test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, 0);
-}
-
/*
* user_regset definitions.
*/
static int s390_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned pos;
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- unsigned long *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- unsigned long __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (pos = 0; pos < sizeof(s390_regs); pos += sizeof(long))
+ membuf_store(&to, __peek_user(target, pos));
return 0;
}
@@ -953,8 +887,8 @@ static int s390_regs_set(struct task_struct *target,
}
static int s390_fpregs_get(struct task_struct *target,
- const struct user_regset *regset, unsigned int pos,
- unsigned int count, void *kbuf, void __user *ubuf)
+ const struct user_regset *regset,
+ struct membuf to)
{
_s390_fp_regs fp_regs;
@@ -964,8 +898,7 @@ static int s390_fpregs_get(struct task_struct *target,
fp_regs.fpc = target->thread.fpu.fpc;
fpregs_store(&fp_regs, &target->thread.fpu);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fp_regs, 0, -1);
+ return membuf_write(&to, &fp_regs, sizeof(fp_regs));
}
static int s390_fpregs_set(struct task_struct *target,
@@ -979,19 +912,18 @@ static int s390_fpregs_set(struct task_struct *target,
if (target == current)
save_fpu_regs();
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_vx_to_fp(fprs, target->thread.fpu.vxrs);
else
memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs));
- /* If setting FPC, must validate it first. */
if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) {
u32 ufpc[2] = { target->thread.fpu.fpc, 0 };
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc,
0, offsetof(s390_fp_regs, fprs));
if (rc)
return rc;
- if (ufpc[1] != 0 || test_fp_ctl(ufpc[0]))
+ if (ufpc[1] != 0)
return -EINVAL;
target->thread.fpu.fpc = ufpc[0];
}
@@ -1002,7 +934,7 @@ static int s390_fpregs_set(struct task_struct *target,
if (rc)
return rc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_fp_to_vx(target->thread.fpu.vxrs, fprs);
else
memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
@@ -1012,20 +944,9 @@ static int s390_fpregs_set(struct task_struct *target,
static int s390_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- if (count > 0) {
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = target->thread.last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(target->thread.last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, target->thread.last_break);
}
static int s390_last_break_set(struct task_struct *target,
@@ -1038,16 +959,15 @@ static int s390_last_break_set(struct task_struct *target,
static int s390_tdb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct pt_regs *regs = task_pt_regs(target);
- unsigned char *data;
+ size_t size;
if (!(regs->int_code & 0x200))
return -ENODATA;
- data = target->thread.trap_tdb;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256);
+ size = sizeof(target->thread.trap_tdb.data);
+ return membuf_write(&to, target->thread.trap_tdb.data, size);
}
static int s390_tdb_set(struct task_struct *target,
@@ -1060,19 +980,18 @@ static int s390_tdb_set(struct task_struct *target,
static int s390_vxrs_low_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
__u64 vxrs[__NUM_VXRS_LOW];
int i;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ vxrs[i] = target->thread.fpu.vxrs[i].low;
+ return membuf_write(&to, vxrs, sizeof(vxrs));
}
static int s390_vxrs_low_set(struct task_struct *target,
@@ -1083,36 +1002,32 @@ static int s390_vxrs_low_set(struct task_struct *target,
__u64 vxrs[__NUM_VXRS_LOW];
int i, rc;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = target->thread.fpu.vxrs[i].low;
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
if (rc == 0)
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ target->thread.fpu.vxrs[i].low = vxrs[i];
return rc;
}
static int s390_vxrs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- __vector128 vxrs[__NUM_VXRS_HIGH];
-
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
- memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs));
-
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
+ return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ __NUM_VXRS_HIGH * sizeof(__vector128));
}
static int s390_vxrs_high_set(struct task_struct *target,
@@ -1122,7 +1037,7 @@ static int s390_vxrs_high_set(struct task_struct *target,
{
int rc;
- if (!MACHINE_HAS_VX)
+ if (!cpu_has_vx())
return -ENODEV;
if (target == current)
save_fpu_regs();
@@ -1134,12 +1049,9 @@ static int s390_vxrs_high_set(struct task_struct *target,
static int s390_system_call_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- unsigned int *data = &target->thread.system_call;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(unsigned int));
+ return membuf_store(&to, target->thread.system_call);
}
static int s390_system_call_set(struct task_struct *target,
@@ -1154,8 +1066,7 @@ static int s390_system_call_set(struct task_struct *target,
static int s390_gs_cb_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_cb;
@@ -1165,8 +1076,7 @@ static int s390_gs_cb_get(struct task_struct *target,
return -ENODATA;
if (target == current)
save_gs_cb(data);
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_cb_set(struct task_struct *target,
@@ -1201,7 +1111,7 @@ static int s390_gs_cb_set(struct task_struct *target,
target->thread.gs_cb = data;
*target->thread.gs_cb = gs_cb;
if (target == current) {
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
restore_gs_cb(target->thread.gs_cb);
}
preempt_enable();
@@ -1210,8 +1120,7 @@ static int s390_gs_cb_set(struct task_struct *target,
static int s390_gs_bc_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct gs_cb *data = target->thread.gs_bc_cb;
@@ -1219,8 +1128,7 @@ static int s390_gs_bc_get(struct task_struct *target,
return -ENODEV;
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct gs_cb));
+ return membuf_write(&to, data, sizeof(struct gs_cb));
}
static int s390_gs_bc_set(struct task_struct *target,
@@ -1256,7 +1164,6 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
cb->pc == 1 &&
cb->qc == 0 &&
cb->reserved2 == 0 &&
- cb->key == PAGE_DEFAULT_KEY &&
cb->reserved3 == 0 &&
cb->reserved4 == 0 &&
cb->reserved5 == 0 &&
@@ -1271,8 +1178,7 @@ static bool is_ri_cb_valid(struct runtime_instr_cb *cb)
static int s390_runtime_instr_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
struct runtime_instr_cb *data = target->thread.ri_cb;
@@ -1281,8 +1187,7 @@ static int s390_runtime_instr_get(struct task_struct *target,
if (!data)
return -ENODATA;
- return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- data, 0, sizeof(struct runtime_instr_cb));
+ return membuf_write(&to, data, sizeof(struct runtime_instr_cb));
}
static int s390_runtime_instr_set(struct task_struct *target,
@@ -1320,7 +1225,11 @@ static int s390_runtime_instr_set(struct task_struct *target,
kfree(data);
return -EINVAL;
}
-
+ /*
+ * Override access key in any case, since user space should
+ * not be able to set it, nor should it care about it.
+ */
+ ri_cb.key = PAGE_DEFAULT_KEY >> 4;
preempt_disable();
if (!target->thread.ri_cb)
target->thread.ri_cb = data;
@@ -1338,7 +1247,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_regs_get,
+ .regset_get = s390_regs_get,
.set = s390_regs_set,
},
{
@@ -1346,7 +1255,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(long),
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1354,7 +1263,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(unsigned int),
.align = sizeof(unsigned int),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1362,7 +1271,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_last_break_get,
+ .regset_get = s390_last_break_get,
.set = s390_last_break_set,
},
{
@@ -1370,7 +1279,7 @@ static const struct user_regset s390_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1378,7 +1287,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1386,7 +1295,7 @@ static const struct user_regset s390_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1394,7 +1303,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1402,7 +1311,7 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1410,13 +1319,13 @@ static const struct user_regset s390_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
static const struct user_regset_view user_s390_view = {
- .name = UTS_MACHINE,
+ .name = "s390x",
.e_machine = EM_S390,
.regsets = s390_regsets,
.n = ARRAY_SIZE(s390_regsets)
@@ -1425,28 +1334,15 @@ static const struct user_regset_view user_s390_view = {
#ifdef CONFIG_COMPAT
static int s390_compat_regs_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
+ unsigned n;
+
if (target == current)
save_access_regs(target->thread.acrs);
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = __peek_user_compat(target, pos);
- count -= sizeof(*k);
- pos += sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(__peek_user_compat(target, pos), u++))
- return -EFAULT;
- count -= sizeof(*u);
- pos += sizeof(*u);
- }
- }
+ for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t))
+ membuf_store(&to, __peek_user_compat(target, n));
return 0;
}
@@ -1488,29 +1384,14 @@ static int s390_compat_regs_set(struct task_struct *target,
static int s390_compat_regs_high_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
compat_ulong_t *gprs_high;
+ int i;
- gprs_high = (compat_ulong_t *)
- &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)];
- if (kbuf) {
- compat_ulong_t *k = kbuf;
- while (count > 0) {
- *k++ = *gprs_high;
- gprs_high += 2;
- count -= sizeof(*k);
- }
- } else {
- compat_ulong_t __user *u = ubuf;
- while (count > 0) {
- if (__put_user(*gprs_high, u++))
- return -EFAULT;
- gprs_high += 2;
- count -= sizeof(*u);
- }
- }
+ gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs;
+ for (i = 0; i < NUM_GPRS; i++, gprs_high += 2)
+ membuf_store(&to, *gprs_high);
return 0;
}
@@ -1549,23 +1430,11 @@ static int s390_compat_regs_high_set(struct task_struct *target,
static int s390_compat_last_break_get(struct task_struct *target,
const struct user_regset *regset,
- unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf)
+ struct membuf to)
{
- compat_ulong_t last_break;
+ compat_ulong_t last_break = target->thread.last_break;
- if (count > 0) {
- last_break = target->thread.last_break;
- if (kbuf) {
- unsigned long *k = kbuf;
- *k = last_break;
- } else {
- unsigned long __user *u = ubuf;
- if (__put_user(last_break, u))
- return -EFAULT;
- }
- }
- return 0;
+ return membuf_store(&to, (unsigned long)last_break);
}
static int s390_compat_last_break_set(struct task_struct *target,
@@ -1582,7 +1451,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_get,
+ .regset_get = s390_compat_regs_get,
.set = s390_compat_regs_set,
},
{
@@ -1590,7 +1459,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_fp_regs) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_fpregs_get,
+ .regset_get = s390_fpregs_get,
.set = s390_fpregs_set,
},
{
@@ -1598,7 +1467,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(compat_uint_t),
.align = sizeof(compat_uint_t),
- .get = s390_system_call_get,
+ .regset_get = s390_system_call_get,
.set = s390_system_call_set,
},
{
@@ -1606,7 +1475,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = sizeof(long),
.align = sizeof(long),
- .get = s390_compat_last_break_get,
+ .regset_get = s390_compat_last_break_get,
.set = s390_compat_last_break_set,
},
{
@@ -1614,7 +1483,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = 1,
.size = 256,
.align = 1,
- .get = s390_tdb_get,
+ .regset_get = s390_tdb_get,
.set = s390_tdb_set,
},
{
@@ -1622,7 +1491,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_LOW,
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_vxrs_low_get,
+ .regset_get = s390_vxrs_low_get,
.set = s390_vxrs_low_set,
},
{
@@ -1630,7 +1499,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = __NUM_VXRS_HIGH,
.size = sizeof(__vector128),
.align = sizeof(__vector128),
- .get = s390_vxrs_high_get,
+ .regset_get = s390_vxrs_high_get,
.set = s390_vxrs_high_set,
},
{
@@ -1638,7 +1507,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t),
.size = sizeof(compat_long_t),
.align = sizeof(compat_long_t),
- .get = s390_compat_regs_high_get,
+ .regset_get = s390_compat_regs_high_get,
.set = s390_compat_regs_high_set,
},
{
@@ -1646,7 +1515,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_cb_get,
+ .regset_get = s390_gs_cb_get,
.set = s390_gs_cb_set,
},
{
@@ -1654,7 +1523,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct gs_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_gs_bc_get,
+ .regset_get = s390_gs_bc_get,
.set = s390_gs_bc_set,
},
{
@@ -1662,7 +1531,7 @@ static const struct user_regset s390_compat_regsets[] = {
.n = sizeof(struct runtime_instr_cb) / sizeof(__u64),
.size = sizeof(__u64),
.align = sizeof(__u64),
- .get = s390_runtime_instr_get,
+ .regset_get = s390_runtime_instr_get,
.set = s390_runtime_instr_set,
},
};
diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S
index 4a22163962eb..88087a32ebc6 100644
--- a/arch/s390/kernel/reipl.S
+++ b/arch/s390/kernel/reipl.S
@@ -19,7 +19,7 @@
# r2 = Function to be called after store status
# r3 = Parameter for function
#
-ENTRY(store_status)
+SYM_CODE_START(store_status)
/* Save register one and load save area base */
stg %r1,__LC_SAVE_AREA_RESTART
/* General purpose registers */
@@ -61,7 +61,7 @@ ENTRY(store_status)
stpx 0(%r1)
/* Clock comparator - seven bytes */
lghi %r1,__LC_CLOCK_COMP_SAVE_AREA
- larl %r4,.Lclkcmp
+ larl %r4,clkcmp
stckc 0(%r4)
mvc 1(7,%r1),1(%r4)
/* Program status word */
@@ -73,9 +73,9 @@ ENTRY(store_status)
lgr %r9,%r2
lgr %r2,%r3
BR_EX %r9
-ENDPROC(store_status)
+SYM_CODE_END(store_status)
.section .bss
- .align 8
-.Lclkcmp: .quad 0x0000000000000000
+ .balign 8
+SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000)
.previous
diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S
index fe396673e8a6..0ae297c82afd 100644
--- a/arch/s390/kernel/relocate_kernel.S
+++ b/arch/s390/kernel/relocate_kernel.S
@@ -2,8 +2,7 @@
/*
* Copyright IBM Corp. 2005
*
- * Author(s): Rolf Adelsberger,
- * Heiko Carstens <heiko.carstens@de.ibm.com>
+ * Author(s): Rolf Adelsberger
*
*/
@@ -15,6 +14,7 @@
* moves the new kernel to its destination...
* %r2 = pointer to first kimage_entry_t
* %r3 = start address - where to jump to after the job is done...
+ * %r4 = subcode
*
* %r5 will be used as temp. storage
* %r6 holds the destination address
@@ -26,53 +26,51 @@
*/
.text
-ENTRY(relocate_kernel)
- basr %r13,0 # base address
- .base:
- lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7
- lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9
- lg %r5,0(%r2) # read another word for indirection page
- aghi %r2,8 # increment pointer
- tml %r5,0x1 # is it a destination page?
- je .indir_check # NO, goto "indir_check"
- lgr %r6,%r5 # r6 = r5
- nill %r6,0xf000 # mask it out and...
- j .base # ...next iteration
- .indir_check:
- tml %r5,0x2 # is it a indirection page?
- je .done_test # NO, goto "done_test"
- nill %r5,0xf000 # YES, mask out,
- lgr %r2,%r5 # move it into the right register,
- j .base # and read next...
- .done_test:
- tml %r5,0x4 # is it the done indicator?
- je .source_test # NO! Well, then it should be the source indicator...
- j .done # ok, lets finish it here...
- .source_test:
- tml %r5,0x8 # it should be a source indicator...
- je .base # NO, ignore it...
- lgr %r8,%r5 # r8 = r5
- nill %r8,0xf000 # masking
- 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0
- jo 0b
- j .base
- .done:
- sgr %r0,%r0 # clear register r0
- cghi %r3,0
- je .diag
- la %r4,load_psw-.base(%r13) # load psw-address into the register
- o %r3,4(%r4) # or load address into psw
- st %r3,4(%r4)
- mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0
- .diag:
- diag %r0,%r0,0x308
-ENDPROC(relocate_kernel)
+SYM_CODE_START(relocate_kernel)
+ basr %r13,0 # base address
+.base:
+ lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7
+ lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9
+ lg %r5,0(%r2) # read another word for indirection page
+ aghi %r2,8 # increment pointer
+ tml %r5,0x1 # is it a destination page?
+ je .indir_check # NO, goto "indir_check"
+ lgr %r6,%r5 # r6 = r5
+ nill %r6,0xf000 # mask it out and...
+ j .base # ...next iteration
+.indir_check:
+ tml %r5,0x2 # is it a indirection page?
+ je .done_test # NO, goto "done_test"
+ nill %r5,0xf000 # YES, mask out,
+ lgr %r2,%r5 # move it into the right register,
+ j .base # and read next...
+.done_test:
+ tml %r5,0x4 # is it the done indicator?
+ je .source_test # NO! Well, then it should be the source indicator...
+ j .done # ok, lets finish it here...
+.source_test:
+ tml %r5,0x8 # it should be a source indicator...
+ je .base # NO, ignore it...
+ lgr %r8,%r5 # r8 = r5
+ nill %r8,0xf000 # masking
+0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0
+ jo 0b
+ j .base
+.done:
+ lgr %r0,%r4 # subcode
+ cghi %r3,0
+ je .diag
+ la %r4,load_psw-.base(%r13) # load psw-address into the register
+ o %r3,4(%r4) # or load address into psw
+ st %r3,4(%r4)
+ mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0
+.diag:
+ diag %r0,%r0,0x308
+SYM_CODE_END(relocate_kernel)
- .align 8
- load_psw:
- .long 0x00080000,0x80000000
- relocate_kernel_end:
- .align 8
- .globl relocate_kernel_len
- relocate_kernel_len:
- .quad relocate_kernel_end - relocate_kernel
+ .balign 8
+SYM_DATA_START_LOCAL(load_psw)
+ .long 0x00080000,0x80000000
+SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end)
+ .balign 8
+SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel)
diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c
new file mode 100644
index 000000000000..af10e6bdd34e
--- /dev/null
+++ b/arch/s390/kernel/rethook.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/rethook.h>
+#include <linux/kprobes.h>
+#include "rethook.h"
+
+void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount)
+{
+ rh->ret_addr = regs->gprs[14];
+ rh->frame = regs->gprs[15];
+
+ /* Replace the return addr with trampoline addr */
+ regs->gprs[14] = (unsigned long)&arch_rethook_trampoline;
+}
+NOKPROBE_SYMBOL(arch_rethook_prepare);
+
+void arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /* Replace fake return address with real one. */
+ regs->gprs[14] = correct_ret_addr;
+}
+NOKPROBE_SYMBOL(arch_rethook_fixup_return);
+
+/*
+ * Called from arch_rethook_trampoline
+ */
+unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs)
+{
+ return rethook_trampoline_handler(regs, regs->gprs[15]);
+}
+NOKPROBE_SYMBOL(arch_rethook_trampoline_callback);
+
+/* assembler function that handles the rethook must not be probed itself */
+NOKPROBE_SYMBOL(arch_rethook_trampoline);
diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h
new file mode 100644
index 000000000000..32f069eed3f3
--- /dev/null
+++ b/arch/s390/kernel/rethook.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __S390_RETHOOK_H
+#define __S390_RETHOOK_H
+
+unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs);
+
+#endif
diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index 125c7f6e8715..1788a5454b6f 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -57,7 +57,7 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb)
cb->k = 1;
cb->ps = 1;
cb->pc = 1;
- cb->key = PAGE_DEFAULT_KEY;
+ cb->key = PAGE_DEFAULT_KEY >> 4;
cb->v = 1;
}
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 9cbf490fd162..d1f3b56e7afc 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -37,7 +37,7 @@
#include <linux/root_dev.h>
#include <linux/console.h>
#include <linux/kernel_stat.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-map-ops.h>
#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pfn.h>
@@ -49,14 +49,17 @@
#include <linux/memory.h>
#include <linux/compat.h>
#include <linux/start_kernel.h>
+#include <linux/hugetlb.h>
+#include <linux/kmemleak.h>
+#include <asm/archrandom.h>
#include <asm/boot_data.h>
#include <asm/ipl.h>
#include <asm/facility.h>
#include <asm/smp.h>
#include <asm/mmu_context.h>
#include <asm/cpcmd.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/page.h>
@@ -71,8 +74,10 @@
#include <asm/numa.h>
#include <asm/alternative.h>
#include <asm/nospec-branch.h>
-#include <asm/mem_detect.h>
+#include <asm/physmem_info.h>
+#include <asm/maccess.h>
#include <asm/uv.h>
+#include <asm/asm-offsets.h>
#include "entry.h"
/*
@@ -87,30 +92,71 @@ EXPORT_SYMBOL(console_devno);
unsigned int console_irq = -1;
EXPORT_SYMBOL(console_irq);
-unsigned long elf_hwcap __read_mostly = 0;
-char elf_platform[ELF_PLATFORM_SIZE];
+/*
+ * Some code and data needs to stay below 2 GB, even when the kernel would be
+ * relocated above 2 GB, because it has to use 31 bit addresses.
+ * Such code and data is part of the .amode31 section.
+ */
+char __amode31_ref *__samode31 = _samode31;
+char __amode31_ref *__eamode31 = _eamode31;
+char __amode31_ref *__stext_amode31 = _stext_amode31;
+char __amode31_ref *__etext_amode31 = _etext_amode31;
+struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table;
+struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table;
-unsigned long int_hwcap = 0;
+/*
+ * Control registers CR2, CR5 and CR15 are initialized with addresses
+ * of tables that must be placed below 2G which is handled by the AMODE31
+ * sections.
+ * Because the AMODE31 sections are relocated below 2G at startup,
+ * the content of control registers CR2, CR5 and CR15 must be updated
+ * with new addresses after the relocation. The initial initialization of
+ * control registers occurs in head64.S and then gets updated again after AMODE31
+ * relocation. We must access the relevant AMODE31 tables indirectly via
+ * pointers placed in the .amode31.refs linker section. Those pointers get
+ * updated automatically during AMODE31 relocation and always contain a valid
+ * address within AMODE31 sections.
+ */
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
-int __bootdata_preserved(prot_virt_guest);
-#endif
+static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64);
+
+static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = {
+ [1] = 0xffffffffffffffff
+};
+
+static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = {
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0,
+ 0x80000000, 0, 0, 0
+};
+
+static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = {
+ 0, 0, 0x89000000, 0,
+ 0, 0, 0x8a000000, 0
+};
+
+static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31;
+static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31;
+static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
+static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
+
+unsigned long __bootdata_preserved(max_mappable);
+unsigned long __bootdata(ident_map_size);
+struct physmem_info __bootdata(physmem_info);
-int __bootdata(noexec_disabled);
-int __bootdata(memory_end_set);
-unsigned long __bootdata(memory_end);
-unsigned long __bootdata(vmalloc_size);
-unsigned long __bootdata(max_physmem_end);
-struct mem_detect_info __bootdata(mem_detect);
-
-struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table);
-struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table);
-unsigned long __bootdata_preserved(__swsusp_reset_dma);
-unsigned long __bootdata_preserved(__stext_dma);
-unsigned long __bootdata_preserved(__etext_dma);
-unsigned long __bootdata_preserved(__sdma);
-unsigned long __bootdata_preserved(__edma);
unsigned long __bootdata_preserved(__kaslr_offset);
+int __bootdata_preserved(__kaslr_enabled);
+unsigned int __bootdata_preserved(zlib_dfltcc_support);
+EXPORT_SYMBOL(zlib_dfltcc_support);
+u64 __bootdata_preserved(stfle_fac_list[16]);
+EXPORT_SYMBOL(stfle_fac_list);
+u64 __bootdata_preserved(alt_stfle_fac_list[16]);
+struct oldmem_data __bootdata_preserved(oldmem_data);
unsigned long VMALLOC_START;
EXPORT_SYMBOL(VMALLOC_START);
@@ -120,6 +166,7 @@ EXPORT_SYMBOL(VMALLOC_END);
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
+unsigned long vmemmap_size;
unsigned long MODULES_VADDR;
unsigned long MODULES_END;
@@ -128,6 +175,14 @@ unsigned long MODULES_END;
struct lowcore *lowcore_ptr[NR_CPUS];
EXPORT_SYMBOL(lowcore_ptr);
+DEFINE_STATIC_KEY_FALSE(cpu_has_bear);
+
+/*
+ * The Write Back bit position in the physaddr is given by the SLPC PCI.
+ * Leaving the mask zero always uses write through which is safe
+ */
+unsigned long mio_wb_bit_mask __ro_after_init;
+
/*
* This is set up by the setup-routine at boot-time
* for S390 need to find out, what we have to setup
@@ -161,7 +216,7 @@ static void __init set_preferred_console(void)
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
else if (CONSOLE_IS_VT220)
- add_preferred_console("ttyS", 1, NULL);
+ add_preferred_console("ttysclp", 0, NULL);
else if (CONSOLE_IS_HVC)
add_preferred_console("hvc", 0, NULL);
}
@@ -241,18 +296,16 @@ static void __init conmode_default(void)
SET_CONSOLE_SCLP;
#endif
}
- if (IS_ENABLED(CONFIG_VT) && IS_ENABLED(CONFIG_DUMMY_CONSOLE))
- conswitchp = &dummy_con;
}
#ifdef CONFIG_CRASH_DUMP
static void __init setup_zfcpdump(void)
{
- if (ipl_info.type != IPL_TYPE_FCP_DUMP)
+ if (!is_ipl_type_dump())
return;
- if (OLDMEM_BASE)
+ if (oldmem_data.start)
return;
- strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev");
+ strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE);
console_loglevel = 2;
}
#else
@@ -303,17 +356,17 @@ void machine_power_off(void)
void (*pm_power_off)(void) = machine_power_off;
EXPORT_SYMBOL_GPL(pm_power_off);
-void *restart_stack __section(.data);
+void *restart_stack;
unsigned long stack_alloc(void)
{
#ifdef CONFIG_VMAP_STACK
- return (unsigned long)
- __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
- __builtin_return_address(0));
+ void *ret;
+
+ ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ kmemleak_not_leak(ret);
+ return (unsigned long)ret;
#else
return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
#endif
@@ -328,51 +381,21 @@ void stack_free(unsigned long stack)
#endif
}
-int __init arch_early_irq_init(void)
-{
- unsigned long stack;
-
- stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
- if (!stack)
- panic("Couldn't allocate async stack");
- S390_lowcore.async_stack = stack + STACK_INIT_OFFSET;
- return 0;
-}
-
-static int __init async_stack_realloc(void)
-{
- unsigned long old, new;
-
- old = S390_lowcore.async_stack - STACK_INIT_OFFSET;
- new = stack_alloc();
- if (!new)
- panic("Couldn't allocate async stack");
- S390_lowcore.async_stack = new + STACK_INIT_OFFSET;
- free_pages(old, THREAD_SIZE_ORDER);
- return 0;
-}
-early_initcall(async_stack_realloc);
-
-void __init arch_call_rest_init(void)
+static unsigned long __init stack_alloc_early(void)
{
unsigned long stack;
- stack = stack_alloc();
- if (!stack)
- panic("Couldn't allocate kernel stack");
- current->stack = (void *) stack;
-#ifdef CONFIG_VMAP_STACK
- current->stack_vm_area = (void *) stack;
-#endif
- set_task_stack_end_magic(current);
- stack += STACK_INIT_OFFSET;
- S390_lowcore.kernel_stack = stack;
- CALL_ON_STACK_NORETURN(rest_init, stack);
+ stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+ if (!stack) {
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, THREAD_SIZE, THREAD_SIZE);
+ }
+ return stack;
}
-static void __init setup_lowcore_dat_off(void)
+static void __init setup_lowcore(void)
{
- struct lowcore *lc;
+ struct lowcore *lc, *abs_lc;
/*
* Setup lowcore for boot cpu
@@ -383,52 +406,40 @@ static void __init setup_lowcore_dat_off(void)
panic("%s: Failed to allocate %zu bytes align=%zx\n",
__func__, sizeof(*lc), sizeof(*lc));
- lc->restart_psw.mask = PSW_KERNEL_BITS;
- lc->restart_psw.addr = (unsigned long) restart_int_handler;
- lc->external_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT;
+ lc->restart_psw.addr = __pa(restart_int_handler);
+ lc->external_new_psw.mask = PSW_KERNEL_BITS;
lc->external_new_psw.addr = (unsigned long) ext_int_handler;
- lc->svc_new_psw.mask = PSW_KERNEL_BITS |
- PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
+ lc->svc_new_psw.mask = PSW_KERNEL_BITS;
lc->svc_new_psw.addr = (unsigned long) system_call;
- lc->program_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->program_new_psw.mask = PSW_KERNEL_BITS;
lc->program_new_psw.addr = (unsigned long) pgm_check_handler;
lc->mcck_new_psw.mask = PSW_KERNEL_BITS;
lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler;
- lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_MCHECK;
+ lc->io_new_psw.mask = PSW_KERNEL_BITS;
lc->io_new_psw.addr = (unsigned long) io_int_handler;
lc->clock_comparator = clock_comparator_max;
- lc->nodat_stack = ((unsigned long) &init_thread_union)
- + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
lc->current_task = (unsigned long)&init_task;
lc->lpp = LPP_MAGIC;
lc->machine_flags = S390_lowcore.machine_flags;
lc->preempt_count = S390_lowcore.preempt_count;
- lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
- nmi_alloc_boot_cpu(lc);
- vdso_alloc_boot_cpu(lc);
- lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
- lc->async_enter_timer = S390_lowcore.async_enter_timer;
+ nmi_alloc_mcesa_early(&lc->mcesad);
+ lc->sys_enter_timer = S390_lowcore.sys_enter_timer;
lc->exit_timer = S390_lowcore.exit_timer;
lc->user_timer = S390_lowcore.user_timer;
lc->system_timer = S390_lowcore.system_timer;
lc->steal_timer = S390_lowcore.steal_timer;
lc->last_update_timer = S390_lowcore.last_update_timer;
lc->last_update_clock = S390_lowcore.last_update_clock;
-
/*
* Allocate the global restart stack which is the same for
- * all CPUs in cast *one* of them does a PSW restart.
+ * all CPUs in case *one* of them does a PSW restart.
*/
- restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE);
- if (!restart_stack)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, THREAD_SIZE, THREAD_SIZE);
- restart_stack += STACK_INIT_OFFSET;
-
+ restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET);
+ lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET;
+ lc->kernel_stack = S390_lowcore.kernel_stack;
/*
* Set up PSW restart to call ipl.c:do_restart(). Copy the relevant
* restart data to the absolute zero lowcore. This is necessary if
@@ -437,32 +448,32 @@ static void __init setup_lowcore_dat_off(void)
lc->restart_stack = (unsigned long) restart_stack;
lc->restart_fn = (unsigned long) do_restart;
lc->restart_data = 0;
- lc->restart_source = -1UL;
-
- /* Setup absolute zero lowcore */
- mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack);
- mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn);
- mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data);
- mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source);
- mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw);
-
+ lc->restart_source = -1U;
lc->spinlock_lockval = arch_spin_lockval(0);
lc->spinlock_index = 0;
arch_spin_lock_setup(0);
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
-
- set_prefix((u32)(unsigned long) lc);
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
+ lc->kernel_asce = S390_lowcore.kernel_asce;
+ lc->user_asce = S390_lowcore.user_asce;
+
+ system_ctlreg_init_save_area(lc);
+ abs_lc = get_abs_lowcore();
+ abs_lc->restart_stack = lc->restart_stack;
+ abs_lc->restart_fn = lc->restart_fn;
+ abs_lc->restart_data = lc->restart_data;
+ abs_lc->restart_source = lc->restart_source;
+ abs_lc->restart_psw = lc->restart_psw;
+ abs_lc->restart_flags = RESTART_FLAG_CTLREGS;
+ abs_lc->program_new_psw = lc->program_new_psw;
+ abs_lc->mcesad = lc->mcesad;
+ put_abs_lowcore(abs_lc);
+
+ set_prefix(__pa(lc));
lowcore_ptr[0] = lc;
-}
-
-static void __init setup_lowcore_dat_on(void)
-{
- __ctl_clear_bit(0, 28);
- S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT;
- S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT;
- __ctl_set_bit(0, 28);
+ if (abs_lowcore_map(0, lowcore_ptr[0], false))
+ panic("Couldn't setup absolute lowcore");
}
static struct resource code_resource = {
@@ -489,8 +500,9 @@ static struct resource __initdata *standard_resources[] = {
static void __init setup_resources(void)
{
struct resource *res, *std_res, *sub_res;
- struct memblock_region *reg;
+ phys_addr_t start, end;
int j;
+ u64 i;
code_resource.start = (unsigned long) _text;
code_resource.end = (unsigned long) _etext - 1;
@@ -499,7 +511,7 @@ static void __init setup_resources(void)
bss_resource.start = (unsigned long) __bss_start;
bss_resource.end = (unsigned long) __bss_stop - 1;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
res = memblock_alloc(sizeof(*res), 8);
if (!res)
panic("%s: Failed to allocate %zu bytes align=0x%x\n",
@@ -507,8 +519,13 @@ static void __init setup_resources(void)
res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
res->name = "System RAM";
- res->start = reg->base;
- res->end = reg->base + reg->size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after the
+ * range while in resources, end points to the last byte in
+ * the range.
+ */
+ res->end = end - 1;
request_resource(&iomem_resource, res);
for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
@@ -539,7 +556,8 @@ static void __init setup_resources(void)
* part of the System RAM resource.
*/
if (crashk_res.end) {
- memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
+ memblock_add_node(crashk_res.start, resource_size(&crashk_res),
+ 0, MEMBLOCK_NONE);
memblock_reserve(crashk_res.start, resource_size(&crashk_res));
insert_resource(&iomem_resource, &crashk_res);
}
@@ -548,56 +566,17 @@ static void __init setup_resources(void)
static void __init setup_memory_end(void)
{
- unsigned long vmax, tmp;
-
- /* Choose kernel address space layout: 3 or 4 levels. */
- if (IS_ENABLED(CONFIG_KASAN)) {
- vmax = IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)
- ? _REGION1_SIZE
- : _REGION2_SIZE;
- } else {
- tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE;
- tmp = tmp * (sizeof(struct page) + PAGE_SIZE);
- if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE)
- vmax = _REGION2_SIZE; /* 3-level kernel page table */
- else
- vmax = _REGION1_SIZE; /* 4-level kernel page table */
- }
-
- /* module area is at the end of the kernel address space. */
- MODULES_END = vmax;
- MODULES_VADDR = MODULES_END - MODULES_LEN;
- VMALLOC_END = MODULES_VADDR;
- VMALLOC_START = VMALLOC_END - vmalloc_size;
-
- /* Split remaining virtual space between 1:1 mapping & vmemmap array */
- tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
- /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */
- tmp = SECTION_ALIGN_UP(tmp);
- tmp = VMALLOC_START - tmp * sizeof(struct page);
- tmp &= ~((vmax >> 11) - 1); /* align to page table level */
- tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS);
- vmemmap = (struct page *) tmp;
-
- /* Take care that memory_end is set and <= vmemmap */
- memory_end = min(memory_end ?: max_physmem_end, (unsigned long)vmemmap);
-#ifdef CONFIG_KASAN
- /* fit in kasan shadow memory region between 1:1 and vmemmap */
- memory_end = min(memory_end, KASAN_SHADOW_START);
- vmemmap = max(vmemmap, (struct page *)KASAN_SHADOW_END);
-#endif
- max_pfn = max_low_pfn = PFN_DOWN(memory_end);
- memblock_remove(memory_end, ULONG_MAX);
-
- pr_notice("The maximum memory size is %luMB\n", memory_end >> 20);
+ max_pfn = max_low_pfn = PFN_DOWN(ident_map_size);
+ pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20);
}
#ifdef CONFIG_CRASH_DUMP
/*
- * When kdump is enabled, we have to ensure that no memory from
- * the area [0 - crashkernel memory size] and
- * [crashk_res.start - crashk_res.end] is set offline.
+ * When kdump is enabled, we have to ensure that no memory from the area
+ * [0 - crashkernel memory size] is set offline - it will be exchanged with
+ * the crashkernel memory region when kdump is triggered. The crashkernel
+ * memory region can never get offlined (pages are unmovable).
*/
static int kdump_mem_notifier(struct notifier_block *nb,
unsigned long action, void *data)
@@ -608,11 +587,7 @@ static int kdump_mem_notifier(struct notifier_block *nb,
return NOTIFY_OK;
if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res)))
return NOTIFY_BAD;
- if (arg->start_pfn > PFN_DOWN(crashk_res.end))
- return NOTIFY_OK;
- if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start))
- return NOTIFY_OK;
- return NOTIFY_BAD;
+ return NOTIFY_OK;
}
static struct notifier_block kdump_mem_nb = {
@@ -622,36 +597,15 @@ static struct notifier_block kdump_mem_nb = {
#endif
/*
- * Make sure that the area behind memory_end is protected
- */
-static void reserve_memory_end(void)
-{
- if (memory_end_set)
- memblock_reserve(memory_end, ULONG_MAX);
-}
-
-/*
- * Make sure that oldmem, where the dump is stored, is protected
+ * Reserve page tables created by decompressor
*/
-static void reserve_oldmem(void)
+static void __init reserve_pgtables(void)
{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_reserve(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
-}
+ unsigned long start, end;
+ struct reserved_range *range;
-/*
- * Make sure that oldmem, where the dump is stored, is protected
- */
-static void remove_oldmem(void)
-{
-#ifdef CONFIG_CRASH_DUMP
- if (OLDMEM_BASE)
- /* Forget all memory above the running kdump system */
- memblock_remove(OLDMEM_SIZE, (phys_addr_t)ULONG_MAX);
-#endif
+ for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end)
+ memblock_reserve(start, end - start);
}
/*
@@ -664,8 +618,8 @@ static void __init reserve_crashkernel(void)
phys_addr_t low, high;
int rc;
- rc = parse_crashkernel(boot_command_line, memory_end, &crash_size,
- &crash_base);
+ rc = parse_crashkernel(boot_command_line, ident_map_size,
+ &crash_size, &crash_base, NULL, NULL);
crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
@@ -678,9 +632,9 @@ static void __init reserve_crashkernel(void)
return;
}
- low = crash_base ?: OLDMEM_BASE;
+ low = crash_base ?: oldmem_data.start;
high = low + crash_size;
- if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) {
+ if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) {
/* The crashkernel fits into OLDMEM, reuse OLDMEM */
crash_base = low;
} else {
@@ -694,8 +648,9 @@ static void __init reserve_crashkernel(void)
return;
}
low = crash_base ?: low;
- crash_base = memblock_find_in_range(low, high, crash_size,
- KEXEC_CRASH_MEM_ALIGN);
+ crash_base = memblock_phys_alloc_range(crash_size,
+ KEXEC_CRASH_MEM_ALIGN,
+ low, high);
}
if (!crash_base) {
@@ -704,10 +659,12 @@ static void __init reserve_crashkernel(void)
return;
}
- if (register_memory_notifier(&kdump_mem_nb))
+ if (register_memory_notifier(&kdump_mem_nb)) {
+ memblock_phys_free(crash_base, crash_size);
return;
+ }
- if (!OLDMEM_BASE && MACHINE_IS_VM)
+ if (!oldmem_data.start && MACHINE_IS_VM)
diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
@@ -725,13 +682,13 @@ static void __init reserve_crashkernel(void)
*/
static void __init reserve_initrd(void)
{
-#ifdef CONFIG_BLK_DEV_INITRD
- if (!INITRD_START || !INITRD_SIZE)
+ unsigned long addr, size;
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size))
return;
- initrd_start = INITRD_START;
- initrd_end = initrd_start + INITRD_SIZE;
- memblock_reserve(INITRD_START, INITRD_SIZE);
-#endif
+ initrd_start = (unsigned long)__va(addr);
+ initrd_end = initrd_start + size;
+ memblock_reserve(addr, size);
}
/*
@@ -743,75 +700,37 @@ static void __init reserve_certificate_list(void)
memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size);
}
-static void __init reserve_mem_detect_info(void)
+static void __init reserve_physmem_info(void)
{
- unsigned long start, size;
+ unsigned long addr, size;
- get_mem_detect_reserved(&start, &size);
- if (size)
- memblock_reserve(start, size);
+ if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size))
+ memblock_reserve(addr, size);
}
-static void __init free_mem_detect_info(void)
+static void __init free_physmem_info(void)
{
- unsigned long start, size;
+ unsigned long addr, size;
- get_mem_detect_reserved(&start, &size);
- if (size)
- memblock_free(start, size);
+ if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size))
+ memblock_phys_free(addr, size);
}
-static void __init memblock_physmem_add(phys_addr_t start, phys_addr_t size)
-{
- memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n",
- start, start + size - 1);
- memblock_add_range(&memblock.memory, start, size, 0, 0);
- memblock_add_range(&memblock.physmem, start, size, 0, 0);
-}
-
-static const char * __init get_mem_info_source(void)
-{
- switch (mem_detect.info_source) {
- case MEM_DETECT_SCLP_STOR_INFO:
- return "sclp storage info";
- case MEM_DETECT_DIAG260:
- return "diag260";
- case MEM_DETECT_SCLP_READ_INFO:
- return "sclp read info";
- case MEM_DETECT_BIN_SEARCH:
- return "binary search";
- }
- return "none";
-}
-
-static void __init memblock_add_mem_detect_info(void)
+static void __init memblock_add_physmem_info(void)
{
unsigned long start, end;
int i;
- memblock_dbg("physmem info source: %s (%hhd)\n",
- get_mem_info_source(), mem_detect.info_source);
+ pr_debug("physmem info source: %s (%hhd)\n",
+ get_physmem_info_source(), physmem_info.info_source);
/* keep memblock lists close to the kernel */
memblock_set_bottom_up(true);
- for_each_mem_detect_block(i, &start, &end)
+ for_each_physmem_usable_range(i, &start, &end)
+ memblock_add(start, end - start);
+ for_each_physmem_online_range(i, &start, &end)
memblock_physmem_add(start, end - start);
memblock_set_bottom_up(false);
- memblock_dump_all();
-}
-
-/*
- * Check for initrd being in usable memory
- */
-static void __init check_initrd(void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- if (INITRD_START && INITRD_SIZE &&
- !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) {
- pr_err("The initial RAM disk does not fit into the memory\n");
- memblock_free(INITRD_START, INITRD_SIZE);
- initrd_start = initrd_end = 0;
- }
-#endif
+ memblock_set_node(0, ULONG_MAX, &memblock.memory, 0);
}
/*
@@ -819,176 +738,68 @@ static void __init check_initrd(void)
*/
static void __init reserve_kernel(void)
{
- unsigned long start_pfn = PFN_UP(__pa(_end));
-
- memblock_reserve(0, HEAD_END);
- memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
- - (unsigned long)_stext);
- memblock_reserve(__sdma, __edma - __sdma);
+ memblock_reserve(0, STARTUP_NORMAL_OFFSET);
+ memblock_reserve(OLDMEM_BASE, sizeof(unsigned long));
+ memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long));
+ memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31);
+ memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP);
+ memblock_reserve(__pa(_stext), _end - _stext);
}
static void __init setup_memory(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/*
* Init storage key for present memory
*/
- for_each_memblock(memory, reg) {
- storage_key_init_range(reg->base, reg->base + reg->size);
- }
- psw_set_key(PAGE_DEFAULT_KEY);
+ for_each_mem_range(i, &start, &end)
+ storage_key_init_range(start, end);
- /* Only cosmetics */
- memblock_enforce_memory_limit(memblock_end_of_DRAM());
+ psw_set_key(PAGE_DEFAULT_KEY);
}
-/*
- * Setup hardware capabilities.
- */
-static int __init setup_hwcaps(void)
+static void __init relocate_amode31_section(void)
{
- static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 };
- struct cpuid cpu_id;
- int i;
+ unsigned long amode31_size = __eamode31 - __samode31;
+ long amode31_offset, *ptr;
- /*
- * The store facility list bits numbers as found in the principles
- * of operation are numbered with bit 1UL<<31 as number 0 to
- * bit 1UL<<0 as number 31.
- * Bit 0: instructions named N3, "backported" to esa-mode
- * Bit 2: z/Architecture mode is active
- * Bit 7: the store-facility-list-extended facility is installed
- * Bit 17: the message-security assist is installed
- * Bit 19: the long-displacement facility is installed
- * Bit 21: the extended-immediate facility is installed
- * Bit 22: extended-translation facility 3 is installed
- * Bit 30: extended-translation facility 3 enhancement facility
- * These get translated to:
- * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1,
- * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3,
- * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and
- * HWCAP_S390_ETF3EH bit 8 (22 && 30).
- */
- for (i = 0; i < 6; i++)
- if (test_facility(stfl_bits[i]))
- elf_hwcap |= 1UL << i;
-
- if (test_facility(22) && test_facility(30))
- elf_hwcap |= HWCAP_S390_ETF3EH;
-
- /*
- * Check for additional facilities with store-facility-list-extended.
- * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0
- * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information
- * as stored by stfl, bits 32-xxx contain additional facilities.
- * How many facility words are stored depends on the number of
- * doublewords passed to the instruction. The additional facilities
- * are:
- * Bit 42: decimal floating point facility is installed
- * Bit 44: perform floating point operation facility is installed
- * translated to:
- * HWCAP_S390_DFP bit 6 (42 && 44).
- */
- if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44))
- elf_hwcap |= HWCAP_S390_DFP;
+ amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31;
+ pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
- /*
- * Huge page support HWCAP_S390_HPAGE is bit 7.
- */
- if (MACHINE_HAS_EDAT1)
- elf_hwcap |= HWCAP_S390_HPAGE;
+ /* Move original AMODE31 section to the new one */
+ memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size);
+ /* Zero out the old AMODE31 section to catch invalid accesses within it */
+ memset(__samode31, 0, amode31_size);
- /*
- * 64-bit register support for 31-bit processes
- * HWCAP_S390_HIGH_GPRS is bit 9.
- */
- elf_hwcap |= HWCAP_S390_HIGH_GPRS;
-
- /*
- * Transactional execution support HWCAP_S390_TE is bit 10.
- */
- if (MACHINE_HAS_TE)
- elf_hwcap |= HWCAP_S390_TE;
-
- /*
- * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension
- * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX
- * instead of facility bit 129.
- */
- if (MACHINE_HAS_VX) {
- elf_hwcap |= HWCAP_S390_VXRS;
- if (test_facility(134))
- elf_hwcap |= HWCAP_S390_VXRS_EXT;
- if (test_facility(135))
- elf_hwcap |= HWCAP_S390_VXRS_BCD;
- if (test_facility(148))
- elf_hwcap |= HWCAP_S390_VXRS_EXT2;
- if (test_facility(152))
- elf_hwcap |= HWCAP_S390_VXRS_PDE;
- }
- if (test_facility(150))
- elf_hwcap |= HWCAP_S390_SORT;
- if (test_facility(151))
- elf_hwcap |= HWCAP_S390_DFLT;
+ /* Update all AMODE31 region references */
+ for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++)
+ *ptr += amode31_offset;
+}
- /*
- * Guarded storage support HWCAP_S390_GS is bit 12.
- */
- if (MACHINE_HAS_GS)
- elf_hwcap |= HWCAP_S390_GS;
-
- get_cpu_id(&cpu_id);
- add_device_randomness(&cpu_id, sizeof(cpu_id));
- switch (cpu_id.machine) {
- case 0x2064:
- case 0x2066:
- default: /* Use "z900" as default for 64 bit kernels. */
- strcpy(elf_platform, "z900");
- break;
- case 0x2084:
- case 0x2086:
- strcpy(elf_platform, "z990");
- break;
- case 0x2094:
- case 0x2096:
- strcpy(elf_platform, "z9-109");
- break;
- case 0x2097:
- case 0x2098:
- strcpy(elf_platform, "z10");
- break;
- case 0x2817:
- case 0x2818:
- strcpy(elf_platform, "z196");
- break;
- case 0x2827:
- case 0x2828:
- strcpy(elf_platform, "zEC12");
- break;
- case 0x2964:
- case 0x2965:
- strcpy(elf_platform, "z13");
- break;
- case 0x3906:
- case 0x3907:
- strcpy(elf_platform, "z14");
- break;
- case 0x8561:
- case 0x8562:
- strcpy(elf_platform, "z15");
- break;
- }
+/* This must be called after AMODE31 relocation */
+static void __init setup_cr(void)
+{
+ union ctlreg2 cr2;
+ union ctlreg5 cr5;
+ union ctlreg15 cr15;
- /*
- * Virtualization support HWCAP_INT_SIE is bit 0.
- */
- if (sclp.has_sief2)
- int_hwcap |= HWCAP_INT_SIE;
+ __ctl_duct[1] = (unsigned long)__ctl_aste;
+ __ctl_duct[2] = (unsigned long)__ctl_aste;
+ __ctl_duct[4] = (unsigned long)__ctl_duald;
- return 0;
+ /* Update control registers CR2, CR5 and CR15 */
+ local_ctl_store(2, &cr2.reg);
+ local_ctl_store(5, &cr5.reg);
+ local_ctl_store(15, &cr15.reg);
+ cr2.ducto = (unsigned long)__ctl_duct >> 6;
+ cr5.pasteo = (unsigned long)__ctl_duct >> 6;
+ cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3;
+ system_ctl_load(2, &cr2.reg);
+ system_ctl_load(5, &cr5.reg);
+ system_ctl_load(15, &cr15.reg);
}
-arch_initcall(setup_hwcaps);
/*
* Add system information as device randomness
@@ -997,30 +808,15 @@ static void __init setup_randomness(void)
{
struct sysinfo_3_2_2 *vmms;
- vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE,
- PAGE_SIZE);
+ vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
if (!vmms)
panic("Failed to allocate memory for sysinfo structure\n");
-
if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
- memblock_free((unsigned long) vmms, PAGE_SIZE);
-}
-
-/*
- * Find the correct size for the task_struct. This depends on
- * the size of the struct fpu at the end of the thread_struct
- * which is embedded in the task_struct.
- */
-static void __init setup_task_size(void)
-{
- int task_size = sizeof(struct task_struct);
+ memblock_free(vmms, PAGE_SIZE);
- if (!MACHINE_HAS_VX) {
- task_size -= sizeof(__vector128) * __NUM_VXRS;
- task_size += sizeof(freg_t) * __NUM_FPRS;
- }
- arch_task_struct_size = task_size;
+ if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG))
+ static_branch_enable(&s390_arch_random_available);
}
/*
@@ -1031,8 +827,7 @@ static void __init setup_control_program_code(void)
{
union diag318_info diag318_info = {
.cpnc = CPNC_LINUX,
- .cpvc_linux = 0,
- .cpvc_distro = {0},
+ .cpvc = 0,
};
if (!sclp.has_diag318)
@@ -1052,11 +847,11 @@ static void __init log_component_list(void)
if (!early_ipl_comp_list_addr)
return;
- if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)
+ if (ipl_block.hdr.flags & IPL_PL_FLAG_SIPL)
pr_info("Linux is running with Secure-IPL enabled\n");
else
pr_info("Linux is running with Secure-IPL disabled\n");
- ptr = (void *) early_ipl_comp_list_addr;
+ ptr = __va(early_ipl_comp_list_addr);
end = (void *) ptr + early_ipl_comp_list_size;
pr_info("The IPL report contains the following components:\n");
while (ptr < end) {
@@ -1102,14 +897,12 @@ void __init setup_arch(char **cmdline_p)
ROOT_DEV = Root_RAM0;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (unsigned long) _end;
+ setup_initial_init_mm(_text, _etext, _edata, _end);
if (IS_ENABLED(CONFIG_EXPOLINE_AUTO))
nospec_auto_detect();
+ jump_label_init();
parse_early_param();
#ifdef CONFIG_CRASH_DUMP
/* Deactivate elfcorehdr= kernel parameter */
@@ -1118,49 +911,44 @@ void __init setup_arch(char **cmdline_p)
os_info_init();
setup_ipl();
- setup_task_size();
setup_control_program_code();
/* Do some memory reservations *before* memory is added to memblock */
- reserve_memory_end();
- reserve_oldmem();
+ reserve_pgtables();
reserve_kernel();
reserve_initrd();
reserve_certificate_list();
- reserve_mem_detect_info();
+ reserve_physmem_info();
+ memblock_set_current_limit(ident_map_size);
memblock_allow_resize();
/* Get information about *all* installed memory */
- memblock_add_mem_detect_info();
-
- free_mem_detect_info();
- remove_oldmem();
-
- /*
- * Make sure all chunks are MAX_ORDER aligned so we don't need the
- * extra checks that HOLES_IN_ZONE would require.
- *
- * Is this still required?
- */
- memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
+ memblock_add_physmem_info();
+ free_physmem_info();
setup_memory_end();
+ memblock_dump_all();
setup_memory();
- dma_contiguous_reserve(memory_end);
+
+ relocate_amode31_section();
+ setup_cr();
+ setup_uv();
+ dma_contiguous_reserve(ident_map_size);
vmcp_cma_reserve();
+ if (MACHINE_HAS_EDAT2)
+ hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
- check_initrd();
reserve_crashkernel();
#ifdef CONFIG_CRASH_DUMP
/*
- * Be aware that smp_save_dump_cpus() triggers a system reset.
+ * Be aware that smp_save_dump_secondary_cpus() triggers a system reset.
* Therefore CPU and device initialization should be done afterwards.
*/
- smp_save_dump_cpus();
+ smp_save_dump_secondary_cpus();
#endif
setup_resources();
- setup_lowcore_dat_off();
+ setup_lowcore();
smp_fill_possible_mask();
cpu_detect_mhz_feature();
cpu_init();
@@ -1168,8 +956,11 @@ void __init setup_arch(char **cmdline_p)
smp_detect_cpus();
topology_init_early();
+ if (test_facility(193))
+ static_branch_enable(&cpu_has_bear);
+
/*
- * Create kernel page tables and switch to virtual addressing.
+ * Create kernel page tables.
*/
paging_init();
@@ -1177,7 +968,9 @@ void __init setup_arch(char **cmdline_p)
* After paging_init created the kernel page table, the new PSWs
* in lowcore can now run with DAT enabled.
*/
- setup_lowcore_dat_on();
+#ifdef CONFIG_CRASH_DUMP
+ smp_save_dump_ipl_cpu();
+#endif
/* Setup default console */
conmode_default();
@@ -1187,7 +980,7 @@ void __init setup_arch(char **cmdline_p)
if (IS_ENABLED(CONFIG_EXPOLINE))
nospec_init_branches();
- /* Setup zfcpdump support */
+ /* Setup zfcp/nvme dump support */
setup_zfcpdump();
/* Add system specific data to the random pool */
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index e6fca5498e1f..43e9661cd715 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -12,10 +12,12 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
+#include <linux/rseq.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/signal.h>
+#include <linux/entry-common.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
@@ -24,13 +26,13 @@
#include <linux/tty.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
-#include <linux/tracehook.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
#include <asm/lowcore.h>
#include <asm/switch_to.h>
+#include <asm/vdso.h>
#include "entry.h"
/*
@@ -139,7 +141,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
{
_sigregs user_sregs;
- /* Alwys make any pending restarted system call return -EINTR */
+ /* Always make any pending restarted system call return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs)))
@@ -148,10 +150,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI))
return -EINVAL;
- /* Test the floating-point-control word. */
- if (test_fp_ctl(user_sregs.fpregs.fpc))
- return -EINVAL;
-
/* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */
regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) |
(user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI));
@@ -181,9 +179,9 @@ static int save_sigregs_ext(struct pt_regs *regs,
int i;
/* Save vector registers to signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
+ vxrs[i] = current->thread.fpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
@@ -201,7 +199,7 @@ static int restore_sigregs_ext(struct pt_regs *regs,
int i;
/* Restore vector registers from signal stack */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
__copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -209,7 +207,7 @@ static int restore_sigregs_ext(struct pt_regs *regs,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i];
+ current->thread.fpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -299,7 +297,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
* included in the signal frame on a 31-bit system.
*/
frame_size = sizeof(*frame) - sizeof(frame->sregs_ext);
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
frame_size += sizeof(frame->sregs_ext);
frame = get_sigframe(ka, regs, frame_size);
if (frame == (void __user *) -1UL)
@@ -332,15 +330,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ka->sa.sa_flags & SA_RESTORER) {
+ if (ka->sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ka->sa.sa_restorer;
- } else {
- /* Signal frame without vector registers are short ! */
- __u16 __user *svc = (void __user *) frame + frame_size - 2;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, sigreturn);
/* Set up registers for signal handler */
regs->gprs[14] = restorer;
@@ -381,7 +374,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
* included in the signal frame on a 31-bit system.
*/
uc_flags = 0;
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
frame_size += sizeof(_sigregs_ext);
uc_flags |= UC_VXRS;
}
@@ -395,14 +388,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
- if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = (unsigned long) ksig->ka.sa.sa_restorer;
- } else {
- __u16 __user *svc = &frame->svc_insn;
- if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc))
- return -EFAULT;
- restorer = (unsigned long) svc;
- }
+ else
+ restorer = VDSO64_SYMBOL(current, rt_sigreturn);
/* Create siginfo on the signal stack */
if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -459,7 +448,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset,
* the kernel can handle, and then we build all the user-level signal handling
* stack-frames in one go after that.
*/
-void do_signal(struct pt_regs *regs)
+
+void arch_do_signal_or_restart(struct pt_regs *regs)
{
struct ksignal ksig;
sigset_t *oldset = sigmask_to_save();
@@ -487,7 +477,7 @@ void do_signal(struct pt_regs *regs)
regs->gprs[2] = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->gprs[2] = regs->orig_gpr2;
regs->psw.addr =
@@ -498,6 +488,7 @@ void do_signal(struct pt_regs *regs)
}
/* No longer in a system call */
clear_pt_regs_flag(regs, PIF_SYSCALL);
+
rseq_signal_deliver(&ksig, regs);
if (is_compat_task())
handle_signal32(&ksig, oldset, regs);
@@ -513,16 +504,22 @@ void do_signal(struct pt_regs *regs)
switch (regs->gprs[2]) {
case -ERESTART_RESTARTBLOCK:
/* Restart with sys_restart_syscall */
- regs->int_code = __NR_restart_syscall;
- /* fallthrough */
+ regs->gprs[2] = regs->orig_gpr2;
+ current->restart_block.arch_data = regs->psw.addr;
+ if (is_compat_task())
+ regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall);
+ else
+ regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall);
+ if (test_thread_flag(TIF_SINGLE_STEP))
+ clear_thread_flag(TIF_PER_TRAP);
+ break;
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
- /* Restart system call with magic TIF bit. */
regs->gprs[2] = regs->orig_gpr2;
- set_pt_regs_flag(regs, PIF_SYSCALL);
+ regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
if (test_thread_flag(TIF_SINGLE_STEP))
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
break;
}
}
@@ -532,10 +529,3 @@ void do_signal(struct pt_regs *regs)
*/
restore_saved_sigmask();
}
-
-void do_notify_resume(struct pt_regs *regs)
-{
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
-}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2794cad9312e..c39d9f0d4b1c 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -5,7 +5,6 @@
* Copyright IBM Corp. 1999, 2012
* Author(s): Denis Joseph Barrow,
* Martin Schwidefsky <schwidefsky@de.ibm.com>,
- * Heiko Carstens <heiko.carstens@de.ibm.com>,
*
* based on other smp stuff by
* (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net>
@@ -30,6 +29,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irqflags.h>
+#include <linux/irq_work.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/sched/hotplug.h>
@@ -37,6 +37,8 @@
#include <linux/crash_dump.h>
#include <linux/kprobes.h>
#include <asm/asm-offsets.h>
+#include <asm/ctlreg.h>
+#include <asm/pfault.h>
#include <asm/diag.h>
#include <asm/switch_to.h>
#include <asm/facility.h>
@@ -45,9 +47,8 @@
#include <asm/irq.h>
#include <asm/tlbflush.h>
#include <asm/vtimer.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/sclp.h>
-#include <asm/vdso.h>
#include <asm/debug.h>
#include <asm/os_info.h>
#include <asm/sigp.h>
@@ -55,12 +56,16 @@
#include <asm/nmi.h>
#include <asm/stacktrace.h>
#include <asm/topology.h>
+#include <asm/vdso.h>
+#include <asm/maccess.h>
#include "entry.h"
enum {
ec_schedule = 0,
ec_call_function_single,
ec_stop_cpu,
+ ec_mcck_pending,
+ ec_irq_work,
};
enum {
@@ -71,7 +76,6 @@ enum {
static DEFINE_PER_CPU(struct cpu *, cpu_device);
struct pcpu {
- struct lowcore *lowcore; /* lowcore page(s) for the cpu */
unsigned long ec_mask; /* bit mask for ec_xxx functions */
unsigned long ec_clk; /* sigp timestamp for ec_xxx */
signed char state; /* physical cpu state */
@@ -93,6 +97,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS];
#endif
static unsigned int smp_max_threads __initdata = -1U;
+cpumask_t cpu_setup_mask;
static int __init early_nosmt(char *s)
{
@@ -110,7 +115,7 @@ early_param("smt", early_smt);
/*
* The smp_cpu_state_mutex must be held when changing the state or polarization
- * member of a pcpu data structure within the pcpu_devices arreay.
+ * member of a pcpu data structure within the pcpu_devices array.
*/
DEFINE_MUTEX(smp_cpu_state_mutex);
@@ -145,7 +150,7 @@ static int pcpu_sigp_retry(struct pcpu *pcpu, u8 order, u32 parm)
static inline int pcpu_stopped(struct pcpu *pcpu)
{
- u32 uninitialized_var(status);
+ u32 status;
if (__pcpu_sigp(pcpu->address, SIGP_SENSE,
0, &status) != SIGP_CC_STATUS_STORED)
@@ -188,102 +193,100 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
{
- unsigned long async_stack, nodat_stack;
+ unsigned long async_stack, nodat_stack, mcck_stack;
struct lowcore *lc;
- if (pcpu != &pcpu_devices[0]) {
- pcpu->lowcore = (struct lowcore *)
- __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
- nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
- if (!pcpu->lowcore || !nodat_stack)
- goto out;
- } else {
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- }
+ lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
+ nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
async_stack = stack_alloc();
- if (!async_stack)
+ mcck_stack = stack_alloc();
+ if (!lc || !nodat_stack || !async_stack || !mcck_stack)
goto out;
- lc = pcpu->lowcore;
memcpy(lc, &S390_lowcore, 512);
memset((char *) lc + 512, 0, sizeof(*lc) - 512);
lc->async_stack = async_stack + STACK_INIT_OFFSET;
lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET;
+ lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
lc->cpu_nr = cpu;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
- lc->br_r1_trampoline = 0x07f1; /* br %r1 */
- if (nmi_alloc_per_cpu(lc))
- goto out_async;
- if (vdso_alloc_per_cpu(lc))
+ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
+ lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
+ lc->preempt_count = PREEMPT_DISABLED;
+ if (nmi_alloc_mcesa(&lc->mcesad))
+ goto out;
+ if (abs_lowcore_map(cpu, lc, true))
goto out_mcesa;
lowcore_ptr[cpu] = lc;
- pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc);
+ pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc));
return 0;
out_mcesa:
- nmi_free_per_cpu(lc);
-out_async:
- stack_free(async_stack);
+ nmi_free_mcesa(&lc->mcesad);
out:
- if (pcpu != &pcpu_devices[0]) {
- free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
- }
+ stack_free(mcck_stack);
+ stack_free(async_stack);
+ free_pages(nodat_stack, THREAD_SIZE_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
return -ENOMEM;
}
static void pcpu_free_lowcore(struct pcpu *pcpu)
{
- unsigned long async_stack, nodat_stack, lowcore;
-
- nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
- async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET;
- lowcore = (unsigned long) pcpu->lowcore;
+ unsigned long async_stack, nodat_stack, mcck_stack;
+ struct lowcore *lc;
+ int cpu;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET;
+ async_stack = lc->async_stack - STACK_INIT_OFFSET;
+ mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET;
pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
- lowcore_ptr[pcpu - pcpu_devices] = NULL;
- vdso_free_per_cpu(pcpu->lowcore);
- nmi_free_per_cpu(pcpu->lowcore);
+ lowcore_ptr[cpu] = NULL;
+ abs_lowcore_unmap(cpu);
+ nmi_free_mcesa(&lc->mcesad);
stack_free(async_stack);
- if (pcpu == &pcpu_devices[0])
- return;
+ stack_free(mcck_stack);
free_pages(nodat_stack, THREAD_SIZE_ORDER);
- free_pages(lowcore, LC_ORDER);
+ free_pages((unsigned long) lc, LC_ORDER);
}
static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc, *abs_lc;
+ lc = lowcore_ptr[cpu];
cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
lc->cpu_nr = cpu;
+ lc->restart_flags = RESTART_FLAG_CTLREGS;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->spinlock_index = 0;
lc->percpu_offset = __per_cpu_offset[cpu];
lc->kernel_asce = S390_lowcore.kernel_asce;
- lc->user_asce = S390_lowcore.kernel_asce;
+ lc->user_asce = s390_invalid_asce;
lc->machine_flags = S390_lowcore.machine_flags;
lc->user_timer = lc->system_timer =
lc->steal_timer = lc->avg_steal_timer = 0;
- __ctl_store(lc->cregs_save_area, 0, 15);
+ abs_lc = get_abs_lowcore();
+ memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area));
+ put_abs_lowcore(abs_lc);
lc->cregs_save_area[1] = lc->kernel_asce;
- lc->cregs_save_area[7] = lc->vdso_asce;
+ lc->cregs_save_area[7] = lc->user_asce;
save_access_regs((unsigned int *) lc->access_regs_save_area);
- memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
- sizeof(lc->stfle_fac_list));
- memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
- sizeof(lc->alt_stfle_fac_list));
arch_spin_lock_setup(cpu);
}
static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
- lc->kernel_stack = (unsigned long) task_stack_page(tsk)
- + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
- lc->current_task = (unsigned long) tsk;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET;
+ lc->current_task = (unsigned long)tsk;
lc->lpp = LPP_MAGIC;
lc->current_pid = tsk->pid;
lc->user_timer = tsk->thread.user_timer;
@@ -296,41 +299,59 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu->lowcore;
+ struct lowcore *lc;
+ int cpu;
- lc->restart_stack = lc->nodat_stack;
+ cpu = pcpu - pcpu_devices;
+ lc = lowcore_ptr[cpu];
+ lc->restart_stack = lc->kernel_stack;
lc->restart_fn = (unsigned long) func;
lc->restart_data = (unsigned long) data;
- lc->restart_source = -1UL;
+ lc->restart_source = -1U;
pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
}
+typedef void (pcpu_delegate_fn)(void *);
+
/*
* Call function via PSW restart on pcpu and stop the current cpu.
*/
-static void __pcpu_delegate(void (*func)(void*), void *data)
+static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
{
func(data); /* should not return */
}
-static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu,
- void (*func)(void *),
- void *data, unsigned long stack)
+static void pcpu_delegate(struct pcpu *pcpu,
+ pcpu_delegate_fn *func,
+ void *data, unsigned long stack)
{
- struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
- unsigned long source_cpu = stap();
+ struct lowcore *lc, *abs_lc;
+ unsigned int source_cpu;
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- if (pcpu->address == source_cpu)
- CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data);
+ lc = lowcore_ptr[pcpu - pcpu_devices];
+ source_cpu = stap();
+
+ if (pcpu->address == source_cpu) {
+ call_on_stack(2, stack, void, __pcpu_delegate,
+ pcpu_delegate_fn *, func, void *, data);
+ }
/* Stop target cpu (if func returns this stops the current cpu). */
pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
+ pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0);
/* Restart func on the target cpu and stop the current cpu. */
- mem_assign_absolute(lc->restart_stack, stack);
- mem_assign_absolute(lc->restart_fn, (unsigned long) func);
- mem_assign_absolute(lc->restart_data, (unsigned long) data);
- mem_assign_absolute(lc->restart_source, source_cpu);
- __bpon();
+ if (lc) {
+ lc->restart_stack = stack;
+ lc->restart_fn = (unsigned long)func;
+ lc->restart_data = (unsigned long)data;
+ lc->restart_source = source_cpu;
+ } else {
+ abs_lc = get_abs_lowcore();
+ abs_lc->restart_stack = stack;
+ abs_lc->restart_fn = (unsigned long)func;
+ abs_lc->restart_data = (unsigned long)data;
+ abs_lc->restart_source = source_cpu;
+ put_abs_lowcore(abs_lc);
+ }
asm volatile(
"0: sigp 0,%0,%2 # sigp restart to target cpu\n"
" brc 2,0b # busy, try again\n"
@@ -382,7 +403,7 @@ void smp_call_online_cpu(void (*func)(void *), void *data)
*/
void smp_call_ipl_cpu(void (*func)(void *), void *data)
{
- struct lowcore *lc = pcpu_devices->lowcore;
+ struct lowcore *lc = lowcore_ptr[0];
if (pcpu_devices[0].address == stap())
lc = &S390_lowcore;
@@ -401,7 +422,12 @@ int smp_find_processor_id(u16 address)
return -1;
}
-bool arch_vcpu_is_preempted(int cpu)
+void schedule_mcck_handler(void)
+{
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending);
+}
+
+bool notrace arch_vcpu_is_preempted(int cpu)
{
if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
return false;
@@ -411,17 +437,15 @@ bool arch_vcpu_is_preempted(int cpu)
}
EXPORT_SYMBOL(arch_vcpu_is_preempted);
-void smp_yield_cpu(int cpu)
+void notrace smp_yield_cpu(int cpu)
{
- if (MACHINE_HAS_DIAG9C) {
- diag_stat_inc_norecursion(DIAG_STAT_X09C);
- asm volatile("diag %0,0,0x9c"
- : : "d" (pcpu_devices[cpu].address));
- } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) {
- diag_stat_inc_norecursion(DIAG_STAT_X044);
- asm volatile("diag 0,0,0x44");
- }
+ if (!MACHINE_HAS_DIAG9C)
+ return;
+ diag_stat_inc_norecursion(DIAG_STAT_X09C);
+ asm volatile("diag %0,0,0x9c"
+ : : "d" (pcpu_devices[cpu].address));
}
+EXPORT_SYMBOL_GPL(smp_yield_cpu);
/*
* Send cpus emergency shutdown signal. This gives the cpus the
@@ -429,10 +453,12 @@ void smp_yield_cpu(int cpu)
*/
void notrace smp_emergency_stop(void)
{
- cpumask_t cpumask;
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+ static cpumask_t cpumask;
u64 end;
int cpu;
+ arch_spin_lock(&lock);
cpumask_copy(&cpumask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpumask);
@@ -453,6 +479,7 @@ void notrace smp_emergency_stop(void)
break;
cpu_relax();
}
+ arch_spin_unlock(&lock);
}
NOKPROBE_SYMBOL(smp_emergency_stop);
@@ -464,7 +491,7 @@ void smp_send_stop(void)
int cpu;
/* Disable all interrupts/machine checks */
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
+ __load_psw_mask(PSW_KERNEL_BITS);
trace_hardirqs_off();
debug_set_critical();
@@ -498,6 +525,10 @@ static void smp_handle_ext_call(void)
scheduler_ipi();
if (test_bit(ec_call_function_single, &bits))
generic_smp_call_function_single_interrupt();
+ if (test_bit(ec_mcck_pending, &bits))
+ s390_handle_mcck();
+ if (test_bit(ec_irq_work, &bits))
+ irq_work_run();
}
static void do_ext_call_interrupt(struct ext_code ext_code,
@@ -525,71 +556,37 @@ void arch_send_call_function_single_ipi(int cpu)
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
}
-/*
- * parameter area for the set/clear control bit callbacks
- */
-struct ec_creg_mask_parms {
- unsigned long orval;
- unsigned long andval;
- int cr;
-};
-
-/*
- * callback for setting/clearing control bits
- */
-static void smp_ctl_bit_callback(void *info)
-{
- struct ec_creg_mask_parms *pp = info;
- unsigned long cregs[16];
-
- __ctl_store(cregs, 0, 15);
- cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval;
- __ctl_load(cregs, 0, 15);
-}
-
-/*
- * Set a bit in a control register of all cpus
- */
-void smp_ctl_set_bit(int cr, int bit)
-{
- struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr };
-
- on_each_cpu(smp_ctl_bit_callback, &parms, 1);
-}
-EXPORT_SYMBOL(smp_ctl_set_bit);
-
-/*
- * Clear a bit in a control register of all cpus
- */
-void smp_ctl_clear_bit(int cr, int bit)
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
{
- struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr };
-
- on_each_cpu(smp_ctl_bit_callback, &parms, 1);
+ pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work);
}
-EXPORT_SYMBOL(smp_ctl_clear_bit);
+#endif
#ifdef CONFIG_CRASH_DUMP
int smp_store_status(int cpu)
{
- struct pcpu *pcpu = pcpu_devices + cpu;
+ struct lowcore *lc;
+ struct pcpu *pcpu;
unsigned long pa;
- pa = __pa(&pcpu->lowcore->floating_pt_save_area);
+ pcpu = pcpu_devices + cpu;
+ lc = lowcore_ptr[cpu];
+ pa = __pa(&lc->floating_pt_save_area);
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
- if (!MACHINE_HAS_VX && !MACHINE_HAS_GS)
+ if (!cpu_has_vx() && !MACHINE_HAS_GS)
return 0;
- pa = __pa(pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK);
+ pa = lc->mcesad & MCESA_ORIGIN_MASK;
if (MACHINE_HAS_GS)
- pa |= pcpu->lowcore->mcesad & MCESA_LC_MASK;
+ pa |= lc->mcesad & MCESA_LC_MASK;
if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS,
pa) != SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
@@ -599,14 +596,14 @@ int smp_store_status(int cpu)
/*
* Collect CPU state of the previous, crashed system.
* There are four cases:
- * 1) standard zfcp dump
- * condition: OLDMEM_BASE == NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 1) standard zfcp/nvme dump
+ * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The boot CPU state is located in
* the absolute lowcore of the memory stored in the HSA. The zcore code
* will copy the boot CPU state from the HSA.
- * 2) stand-alone kdump for SCSI (zfcp dump with swapped memory)
- * condition: OLDMEM_BASE != NULL && ipl_info.type == IPL_TYPE_FCP_DUMP
+ * 2) stand-alone kdump for SCSI/NVMe (zfcp/nvme dump with swapped memory)
+ * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The firmware or the boot-loader
* stored the registers of the boot CPU in the absolute lowcore in the
@@ -622,42 +619,39 @@ int smp_store_status(int cpu)
* This case does not exist for s390 anymore, setup_arch explicitly
* deactivates the elfcorehdr= kernel parameter
*/
-static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+static bool dump_available(void)
{
- __vector128 *vxrs = (__vector128 *) page;
-
- if (is_boot_cpu)
- vxrs = boot_cpu_vector_save_area;
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page);
- save_area_add_vxrs(sa, vxrs);
+ return oldmem_data.start || is_ipl_type_dump();
}
-static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr,
- bool is_boot_cpu, unsigned long page)
+void __init smp_save_dump_ipl_cpu(void)
{
- void *regs = (void *) page;
+ struct save_area *sa;
+ void *regs;
- if (is_boot_cpu)
- copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512);
- else
- __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page);
+ if (!dump_available())
+ return;
+ sa = save_area_alloc(true);
+ regs = memblock_alloc(512, 8);
+ if (!sa || !regs)
+ panic("could not allocate memory for boot CPU save area\n");
+ copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512);
save_area_add_regs(sa, regs);
+ memblock_free(regs, 512);
+ if (cpu_has_vx())
+ save_area_add_vxrs(sa, boot_cpu_vector_save_area);
}
-void __init smp_save_dump_cpus(void)
+void __init smp_save_dump_secondary_cpus(void)
{
int addr, boot_cpu_addr, max_cpu_addr;
struct save_area *sa;
- unsigned long page;
- bool is_boot_cpu;
+ void *page;
- if (!(OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP))
- /* No previous system present, normal boot. */
+ if (!dump_available())
return;
/* Allocate a page as dumping area for the store status sigps */
- page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31);
+ page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
if (!page)
panic("ERROR: Failed to allocate %lx bytes below %lx\n",
PAGE_SIZE, 1UL << 31);
@@ -667,29 +661,23 @@ void __init smp_save_dump_cpus(void)
boot_cpu_addr = stap();
max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev;
for (addr = 0; addr <= max_cpu_addr; addr++) {
+ if (addr == boot_cpu_addr)
+ continue;
if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) ==
SIGP_CC_NOT_OPERATIONAL)
continue;
- is_boot_cpu = (addr == boot_cpu_addr);
- /* Allocate save area */
- sa = save_area_alloc(is_boot_cpu);
+ sa = save_area_alloc(false);
if (!sa)
panic("could not allocate memory for save area\n");
- if (MACHINE_HAS_VX)
- /* Get the vector registers */
- smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page);
- /*
- * For a zfcp dump OLDMEM_BASE == NULL and the registers
- * of the boot CPU are stored in the HSA. To retrieve
- * these registers an SCLP request is required which is
- * done by drivers/s390/char/zcore.c:init_cpu_info()
- */
- if (!is_boot_cpu || OLDMEM_BASE)
- /* Get the CPU registers */
- smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
+ __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page));
+ save_area_add_regs(sa, page);
+ if (cpu_has_vx()) {
+ __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page));
+ save_area_add_vxrs(sa, page);
+ }
}
memblock_free(page, PAGE_SIZE);
- diag_dma_ops.diag308_reset();
+ diag_amode31_ops.diag308_reset();
pcpu_set_smt(0);
}
#endif /* CONFIG_CRASH_DUMP */
@@ -704,6 +692,11 @@ int smp_cpu_get_polarization(int cpu)
return pcpu_devices[cpu].polarization;
}
+int smp_cpu_get_cpu_address(int cpu)
+{
+ return pcpu_devices[cpu].address;
+}
+
static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
{
static int use_sigp_detection;
@@ -763,11 +756,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
{
struct sclp_core_entry *core;
- cpumask_t avail;
+ static cpumask_t avail;
bool configured;
u16 core_id;
int nr, i;
+ cpus_read_lock();
+ mutex_lock(&smp_cpu_state_mutex);
nr = 0;
cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask);
/*
@@ -788,6 +783,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
configured = i < info->configured;
nr += smp_add_core(&info->core[i], &avail, configured, early);
}
+ mutex_unlock(&smp_cpu_state_mutex);
+ cpus_read_unlock();
return nr;
}
@@ -835,83 +832,70 @@ void __init smp_detect_cpus(void)
pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
/* Add CPUs present at boot */
- get_online_cpus();
__smp_rescan_cpus(info, true);
- put_online_cpus();
- memblock_free_early((unsigned long)info, sizeof(*info));
+ memblock_free(info, sizeof(*info));
}
-static void smp_init_secondary(void)
+/*
+ * Activate a secondary processor.
+ */
+static void smp_start_secondary(void *cpuvoid)
{
- int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
S390_lowcore.last_update_clock = get_tod_clock();
+ S390_lowcore.restart_stack = (unsigned long)restart_stack;
+ S390_lowcore.restart_fn = (unsigned long)do_restart;
+ S390_lowcore.restart_data = 0;
+ S390_lowcore.restart_source = -1U;
+ S390_lowcore.restart_flags = 0;
restore_access_regs(S390_lowcore.access_regs_save_area);
- set_cpu_flag(CIF_ASCE_PRIMARY);
- set_cpu_flag(CIF_ASCE_SECONDARY);
cpu_init();
- preempt_disable();
+ rcutree_report_cpu_starting(cpu);
init_cpu_timer();
vtime_init();
+ vdso_getcpu_init();
pfault_init();
- notify_cpu_starting(smp_processor_id());
+ cpumask_set_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
+ notify_cpu_starting(cpu);
if (topology_cpu_dedicated(cpu))
set_cpu_flag(CIF_DEDICATED_CPU);
else
clear_cpu_flag(CIF_DEDICATED_CPU);
- set_cpu_online(smp_processor_id(), true);
+ set_cpu_online(cpu, true);
inc_irq_stat(CPU_RST);
local_irq_enable();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-/*
- * Activate a secondary processor.
- */
-static void __no_sanitize_address smp_start_secondary(void *cpuvoid)
-{
- S390_lowcore.restart_stack = (unsigned long) restart_stack;
- S390_lowcore.restart_fn = (unsigned long) do_restart;
- S390_lowcore.restart_data = 0;
- S390_lowcore.restart_source = -1UL;
- __ctl_load(S390_lowcore.cregs_save_area, 0, 15);
- __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
- CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack);
-}
-
/* Upping and downing of CPUs */
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
- struct pcpu *pcpu;
- int base, i, rc;
+ struct pcpu *pcpu = pcpu_devices + cpu;
+ int rc;
- pcpu = pcpu_devices + cpu;
if (pcpu->state != CPU_STATE_CONFIGURED)
return -EIO;
- base = smp_get_base_cpu(cpu);
- for (i = 0; i <= smp_cpu_mtid; i++) {
- if (base + i < nr_cpu_ids)
- if (cpu_online(base + i))
- break;
- }
- /*
- * If this is the first CPU of the core to get online
- * do an initial CPU reset.
- */
- if (i > smp_cpu_mtid &&
- pcpu_sigp_retry(pcpu_devices + base, SIGP_INITIAL_CPU_RESET, 0) !=
+ if (pcpu_sigp_retry(pcpu, SIGP_INITIAL_CPU_RESET, 0) !=
SIGP_CC_ORDER_CODE_ACCEPTED)
return -EIO;
rc = pcpu_alloc_lowcore(pcpu, cpu);
if (rc)
return rc;
+ /*
+ * Make sure global control register contents do not change
+ * until new CPU has initialized control registers.
+ */
+ system_ctlreg_lock();
pcpu_prepare_secondary(pcpu, cpu);
pcpu_attach_task(pcpu, tidle);
pcpu_start_fn(pcpu, smp_start_secondary, NULL);
/* Wait until cpu puts itself in the online & active maps */
while (!cpu_online(cpu))
cpu_relax();
+ system_ctlreg_unlock();
return 0;
}
@@ -926,19 +910,23 @@ early_param("possible_cpus", _setup_possible_cpus);
int __cpu_disable(void)
{
- unsigned long cregs[16];
+ struct ctlreg cregs[16];
+ int cpu;
/* Handle possible pending IPIs */
smp_handle_ext_call();
- set_cpu_online(smp_processor_id(), false);
+ cpu = smp_processor_id();
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, &cpu_setup_mask);
+ update_cpu_masks();
/* Disable pseudo page faults on this cpu. */
pfault_fini();
/* Disable interrupt sources via control register. */
- __ctl_store(cregs, 0, 15);
- cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */
- cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */
- cregs[14] &= ~0x1f000000UL; /* disable most machine checks */
- __ctl_load(cregs, 0, 15);
+ __local_ctl_store(0, 15, cregs);
+ cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */
+ cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */
+ cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */
+ __local_ctl_load(0, 15, cregs);
clear_cpu_flag(CIF_NOHZ_DELAY);
return 0;
}
@@ -959,7 +947,6 @@ void __cpu_die(unsigned int cpu)
void __noreturn cpu_die(void)
{
idle_task_exit();
- __bpon();
pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
for (;;) ;
}
@@ -979,12 +966,12 @@ void __init smp_fill_possible_mask(void)
void __init smp_prepare_cpus(unsigned int max_cpus)
{
- /* request the 0x1201 emergency signal external interrupt */
if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt))
panic("Couldn't request external interrupt 0x1201");
- /* request the 0x1202 external call external interrupt */
+ system_ctl_set_bit(0, 14);
if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
panic("Couldn't request external interrupt 0x1202");
+ system_ctl_set_bit(0, 13);
}
void __init smp_prepare_boot_cpu(void)
@@ -993,15 +980,10 @@ void __init smp_prepare_boot_cpu(void)
WARN_ON(!cpu_present(0) || !cpu_online(0));
pcpu->state = CPU_STATE_CONFIGURED;
- pcpu->lowcore = (struct lowcore *)(unsigned long) store_prefix();
S390_lowcore.percpu_offset = __per_cpu_offset[0];
smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
}
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-}
-
void __init smp_setup_processor_id(void)
{
pcpu_devices[0].address = stap();
@@ -1044,14 +1026,12 @@ static ssize_t cpu_configure_store(struct device *dev,
return -EINVAL;
if (val != 0 && val != 1)
return -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
rc = -EBUSY;
- /* disallow configuration changes of online cpus and cpu 0 */
+ /* disallow configuration changes of online cpus */
cpu = dev->id;
cpu = smp_get_base_cpu(cpu);
- if (cpu == 0)
- goto out;
for (i = 0; i <= smp_cpu_mtid; i++)
if (cpu_online(cpu + i))
goto out;
@@ -1093,7 +1073,7 @@ static ssize_t cpu_configure_store(struct device *dev,
}
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
@@ -1131,6 +1111,7 @@ static int smp_cpu_online(unsigned int cpu)
return sysfs_create_group(&s->kobj, &cpu_online_attr_group);
}
+
static int smp_cpu_pre_down(unsigned int cpu)
{
struct device *s = &per_cpu(cpu_device, cpu)->dev;
@@ -1150,7 +1131,7 @@ static int smp_add_present_cpu(int cpu)
return -ENOMEM;
per_cpu(cpu_device, cpu) = c;
s = &c->dev;
- c->hotpluggable = 1;
+ c->hotpluggable = !!cpu;
rc = register_cpu(c, cpu);
if (rc)
goto out;
@@ -1179,11 +1160,7 @@ int __ref smp_rescan_cpus(void)
if (!info)
return -ENOMEM;
smp_get_core_info(info, 0);
- get_online_cpus();
- mutex_lock(&smp_cpu_state_mutex);
nr = __smp_rescan_cpus(info, false);
- mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
kfree(info);
if (nr)
topology_schedule_update();
@@ -1208,11 +1185,17 @@ static DEVICE_ATTR_WO(rescan);
static int __init s390_smp_init(void)
{
+ struct device *dev_root;
int cpu, rc = 0;
- rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan);
- if (rc)
- return rc;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_rescan);
+ put_device(dev_root);
+ if (rc)
+ return rc;
+ }
+
for_each_present_cpu(cpu) {
rc = smp_add_present_cpu(cpu);
if (rc)
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index fc5419ac64c8..94f440e38303 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -3,13 +3,15 @@
* Stack trace management functions
*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/stacktrace.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
#include <asm/kprobes.h>
+#include <asm/ptrace.h>
void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
struct task_struct *task, struct pt_regs *regs)
@@ -19,17 +21,11 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
unwind_for_each_frame(&state, task, regs, 0) {
addr = unwind_get_return_address(&state);
- if (!addr || !consume_entry(cookie, addr, false))
+ if (!addr || !consume_entry(cookie, addr))
break;
}
}
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack. Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
void *cookie, struct task_struct *task)
{
@@ -47,16 +43,16 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (!addr)
return -EINVAL;
-#ifdef CONFIG_KPROBES
+#ifdef CONFIG_RETHOOK
/*
- * Mark stacktraces with kretprobed functions on them
+ * Mark stacktraces with krethook functions on them
* as unreliable.
*/
- if (state.ip == (unsigned long)kretprobe_trampoline)
+ if (state.ip == (unsigned long)arch_rethook_trampoline)
return -EINVAL;
#endif
- if (!consume_entry(cookie, addr, false))
+ if (!consume_entry(cookie, addr))
return -EINVAL;
}
@@ -65,3 +61,43 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
return -EINVAL;
return 0;
}
+
+void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
+ const struct pt_regs *regs)
+{
+ struct stack_frame_user __user *sf;
+ unsigned long ip, sp;
+ bool first = true;
+
+ if (is_compat_task())
+ return;
+ if (!consume_entry(cookie, instruction_pointer(regs)))
+ return;
+ sf = (void __user *)user_stack_pointer(regs);
+ pagefault_disable();
+ while (1) {
+ if (__get_user(sp, &sf->back_chain))
+ break;
+ if (__get_user(ip, &sf->gprs[8]))
+ break;
+ if (ip & 0x1) {
+ /*
+ * If the instruction address is invalid, and this
+ * is the first stack frame, assume r14 has not
+ * been written to the stack yet. Otherwise exit.
+ */
+ if (first && !(regs->gprs[14] & 0x1))
+ ip = regs->gprs[14];
+ else
+ break;
+ }
+ if (!consume_entry(cookie, ip))
+ break;
+ /* Sanity check: ABI requires SP to be aligned 8 bytes. */
+ if (!sp || sp & 0x7)
+ break;
+ sf = (void __user *)sp;
+ first = false;
+ }
+ pagefault_enable();
+}
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 888cc2f166db..30bb20461db4 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns)
if (pages <= 0)
return;
- diag204_buf = vmalloc(array_size(pages, PAGE_SIZE));
+ diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE),
+ PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
if (!diag204_buf)
return;
@@ -395,19 +397,18 @@ out:
static int sthyi(u64 vaddr, u64 *rc)
{
- register u64 code asm("0") = 0;
- register u64 addr asm("2") = vaddr;
- register u64 rcode asm("3");
+ union register_pair r1 = { .even = 0, }; /* subcode */
+ union register_pair r2 = { .even = vaddr, };
int cc;
asm volatile(
- ".insn rre,0xB2560000,%[code],%[addr]\n"
+ ".insn rre,0xB2560000,%[r1],%[r2]\n"
"ipm %[cc]\n"
"srl %[cc],28\n"
- : [cc] "=d" (cc), "=d" (rcode)
- : [code] "d" (code), [addr] "a" (addr)
+ : [cc] "=&d" (cc), [r2] "+&d" (r2.pair)
+ : [r1] "d" (r1.pair)
: "memory", "cc");
- *rc = rcode;
+ *rc = r2.odd;
return cc;
}
@@ -460,9 +461,9 @@ static int sthyi_update_cache(u64 *rc)
*
* Fills the destination with system information returned by the STHYI
* instruction. The data is generated by emulation or execution of STHYI,
- * if available. The return value is the condition code that would be
- * returned, the rc parameter is the return code which is passed in
- * register R2 + 1.
+ * if available. The return value is either a negative error value or
+ * the condition code that would be returned, the rc parameter is the
+ * return code which is passed in register R2 + 1.
*/
int sthyi_fill(void *dst, u64 *rc)
{
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c
deleted file mode 100644
index 75b7b307946e..000000000000
--- a/arch/s390/kernel/suspend.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Suspend support specific for s390.
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- */
-
-#include <linux/pfn.h>
-#include <linux/suspend.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <asm/ctl_reg.h>
-#include <asm/ipl.h>
-#include <asm/cio.h>
-#include <asm/sections.h>
-#include "entry.h"
-
-/*
- * The restore of the saved pages in an hibernation image will set
- * the change and referenced bits in the storage key for each page.
- * Overindication of the referenced bits after an hibernation cycle
- * does not cause any harm but the overindication of the change bits
- * would cause trouble.
- * Use the ARCH_SAVE_PAGE_KEYS hooks to save the storage key of each
- * page to the most significant byte of the associated page frame
- * number in the hibernation image.
- */
-
-/*
- * Key storage is allocated as a linked list of pages.
- * The size of the keys array is (PAGE_SIZE - sizeof(long))
- */
-struct page_key_data {
- struct page_key_data *next;
- unsigned char data[];
-};
-
-#define PAGE_KEY_DATA_SIZE (PAGE_SIZE - sizeof(struct page_key_data *))
-
-static struct page_key_data *page_key_data;
-static struct page_key_data *page_key_rp, *page_key_wp;
-static unsigned long page_key_rx, page_key_wx;
-unsigned long suspend_zero_pages;
-
-/*
- * For each page in the hibernation image one additional byte is
- * stored in the most significant byte of the page frame number.
- * On suspend no additional memory is required but on resume the
- * keys need to be memorized until the page data has been restored.
- * Only then can the storage keys be set to their old state.
- */
-unsigned long page_key_additional_pages(unsigned long pages)
-{
- return DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
-}
-
-/*
- * Free page_key_data list of arrays.
- */
-void page_key_free(void)
-{
- struct page_key_data *pkd;
-
- while (page_key_data) {
- pkd = page_key_data;
- page_key_data = pkd->next;
- free_page((unsigned long) pkd);
- }
-}
-
-/*
- * Allocate page_key_data list of arrays with enough room to store
- * one byte for each page in the hibernation image.
- */
-int page_key_alloc(unsigned long pages)
-{
- struct page_key_data *pk;
- unsigned long size;
-
- size = DIV_ROUND_UP(pages, PAGE_KEY_DATA_SIZE);
- while (size--) {
- pk = (struct page_key_data *) get_zeroed_page(GFP_KERNEL);
- if (!pk) {
- page_key_free();
- return -ENOMEM;
- }
- pk->next = page_key_data;
- page_key_data = pk;
- }
- page_key_rp = page_key_wp = page_key_data;
- page_key_rx = page_key_wx = 0;
- return 0;
-}
-
-/*
- * Save the storage key into the upper 8 bits of the page frame number.
- */
-void page_key_read(unsigned long *pfn)
-{
- struct page *page;
- unsigned long addr;
- unsigned char key;
-
- page = pfn_to_page(*pfn);
- addr = (unsigned long) page_address(page);
- key = (unsigned char) page_get_storage_key(addr) & 0x7f;
- if (arch_test_page_nodat(page))
- key |= 0x80;
- *(unsigned char *) pfn = key;
-}
-
-/*
- * Extract the storage key from the upper 8 bits of the page frame number
- * and store it in the page_key_data list of arrays.
- */
-void page_key_memorize(unsigned long *pfn)
-{
- page_key_wp->data[page_key_wx] = *(unsigned char *) pfn;
- *(unsigned char *) pfn = 0;
- if (++page_key_wx < PAGE_KEY_DATA_SIZE)
- return;
- page_key_wp = page_key_wp->next;
- page_key_wx = 0;
-}
-
-/*
- * Get the next key from the page_key_data list of arrays and set the
- * storage key of the page referred by @address. If @address refers to
- * a "safe" page the swsusp_arch_resume code will transfer the storage
- * key from the buffer page to the original page.
- */
-void page_key_write(void *address)
-{
- struct page *page;
- unsigned char key;
-
- key = page_key_rp->data[page_key_rx];
- page_set_storage_key((unsigned long) address, key & 0x7f, 0);
- page = virt_to_page(address);
- if (key & 0x80)
- arch_set_page_nodat(page, 0);
- else
- arch_set_page_dat(page, 0);
- if (++page_key_rx >= PAGE_KEY_DATA_SIZE)
- return;
- page_key_rp = page_key_rp->next;
- page_key_rx = 0;
-}
-
-int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin));
- unsigned long nosave_end_pfn = PFN_DOWN(__pa(&__nosave_end));
- unsigned long end_rodata_pfn = PFN_DOWN(__pa(__end_rodata)) - 1;
- unsigned long stext_pfn = PFN_DOWN(__pa(_stext));
-
- /* Always save lowcore pages (LC protection might be enabled). */
- if (pfn <= LC_PAGES)
- return 0;
- if (pfn >= nosave_begin_pfn && pfn < nosave_end_pfn)
- return 1;
- /* Skip memory holes and read-only pages (DCSS, ...). */
- if (pfn >= stext_pfn && pfn <= end_rodata_pfn)
- return 0;
- if (tprot(PFN_PHYS(pfn)))
- return 1;
- return 0;
-}
-
-/*
- * PM notifier callback for suspend
- */
-static int suspend_pm_cb(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- switch (action) {
- case PM_SUSPEND_PREPARE:
- case PM_HIBERNATION_PREPARE:
- suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER);
- if (!suspend_zero_pages)
- return NOTIFY_BAD;
- break;
- case PM_POST_SUSPEND:
- case PM_POST_HIBERNATION:
- free_pages(suspend_zero_pages, LC_ORDER);
- break;
- default:
- return NOTIFY_DONE;
- }
- return NOTIFY_OK;
-}
-
-static int __init suspend_pm_init(void)
-{
- pm_notifier(suspend_pm_cb, 0);
- return 0;
-}
-arch_initcall(suspend_pm_init);
-
-void save_processor_state(void)
-{
- /* swsusp_arch_suspend() actually saves all cpu register contents.
- * Machine checks must be disabled since swsusp_arch_suspend() stores
- * register contents to their lowcore save areas. That's the same
- * place where register contents on machine checks would be saved.
- * To avoid register corruption disable machine checks.
- * We must also disable machine checks in the new psw mask for
- * program checks, since swsusp_arch_suspend() may generate program
- * checks. Disabling machine checks for all other new psw masks is
- * just paranoia.
- */
- local_mcck_disable();
- /* Disable lowcore protection */
- __ctl_clear_bit(0,28);
- S390_lowcore.external_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask &= ~PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask &= ~PSW_MASK_MCHECK;
-}
-
-void restore_processor_state(void)
-{
- S390_lowcore.external_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.svc_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.io_new_psw.mask |= PSW_MASK_MCHECK;
- S390_lowcore.program_new_psw.mask |= PSW_MASK_MCHECK;
- /* Enable lowcore protection */
- __ctl_set_bit(0,28);
- local_mcck_enable();
-}
-
-/* Called at the end of swsusp_arch_resume */
-void s390_early_resume(void)
-{
- lgr_info_log();
- channel_subsystem_reinit();
- zpci_rescan();
-}
diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S
deleted file mode 100644
index a7baf0b5f818..000000000000
--- a/arch/s390/kernel/swsusp.S
+++ /dev/null
@@ -1,276 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * S390 64-bit swsusp implementation
- *
- * Copyright IBM Corp. 2009
- *
- * Author(s): Hans-Joachim Picht <hans@linux.vnet.ibm.com>
- * Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-#include <asm/asm-offsets.h>
-#include <asm/nospec-insn.h>
-#include <asm/sigp.h>
-
-/*
- * Save register context in absolute 0 lowcore and call swsusp_save() to
- * create in-memory kernel image. The context is saved in the designated
- * "store status" memory locations (see POP).
- * We return from this function twice. The first time during the suspend to
- * disk process. The second time via the swsusp_arch_resume() function
- * (see below) in the resume process.
- * This function runs with disabled interrupts.
- */
- GEN_BR_THUNK %r14
-
- .section .text
-ENTRY(swsusp_arch_suspend)
- lg %r1,__LC_NODAT_STACK
- stmg %r6,%r15,__SF_GPRS(%r1)
- aghi %r1,-STACK_FRAME_OVERHEAD
- stg %r15,__SF_BACKCHAIN(%r1)
- lgr %r15,%r1
-
- /* Store FPU registers */
- brasl %r14,save_fpu_regs
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Store prefix register on stack */
- stpx __SF_EMPTY(%r15)
-
- /* Save prefix register contents for lowcore copy */
- llgf %r10,__SF_EMPTY(%r15)
-
- /* Get pointer to save area */
- lghi %r1,0x1000
-
- /* Save CPU address */
- stap __LC_EXT_CPU_ADDR(%r0)
-
- /* Store registers */
- mvc 0x318(4,%r1),__SF_EMPTY(%r15) /* move prefix to lowcore */
- stam %a0,%a15,0x340(%r1) /* store access registers */
- stctg %c0,%c15,0x380(%r1) /* store control registers */
- stmg %r0,%r15,0x280(%r1) /* store general registers */
-
- stpt 0x328(%r1) /* store timer */
- stck __SF_EMPTY(%r15) /* store clock */
- stckc 0x330(%r1) /* store clock comparator */
-
- /* Update cputime accounting before going to sleep */
- lg %r0,__LC_LAST_UPDATE_TIMER
- slg %r0,0x328(%r1)
- alg %r0,__LC_SYSTEM_TIMER
- stg %r0,__LC_SYSTEM_TIMER
- mvc __LC_LAST_UPDATE_TIMER(8),0x328(%r1)
- lg %r0,__LC_LAST_UPDATE_CLOCK
- slg %r0,__SF_EMPTY(%r15)
- alg %r0,__LC_STEAL_TIMER
- stg %r0,__LC_STEAL_TIMER
- mvc __LC_LAST_UPDATE_CLOCK(8),__SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Save absolute zero pages */
- larl %r2,suspend_zero_pages
- lg %r2,0(%r2)
- lghi %r4,0
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Copy lowcore to absolute zero lowcore */
- lghi %r2,0
- lgr %r4,%r10
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Save image */
- brasl %r14,swsusp_save
-
- /* Restore prefix register and return */
- lghi %r1,0x1000
- spx 0x318(%r1)
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_suspend)
-
-/*
- * Restore saved memory image to correct place and restore register context.
- * Then we return to the function that called swsusp_arch_suspend().
- * swsusp_arch_resume() runs with disabled interrupts.
- */
-ENTRY(swsusp_arch_resume)
- stmg %r6,%r15,__SF_GPRS(%r15)
- lgr %r1,%r15
- aghi %r15,-STACK_FRAME_OVERHEAD
- stg %r1,__SF_BACKCHAIN(%r15)
-
- /* Make all free pages stable */
- lghi %r2,1
- brasl %r14,arch_set_page_states
-
- /* Set prefix page to zero */
- xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15)
- spx __SF_EMPTY(%r15)
-
- /* Deactivate DAT */
- stnsm __SF_EMPTY(%r15),0xfb
-
- /* Restore saved image */
- larl %r1,restore_pblist
- lg %r1,0(%r1)
- ltgr %r1,%r1
- jz 2f
-0:
- lg %r2,8(%r1)
- lg %r4,0(%r1)
- iske %r0,%r4
- lghi %r3,PAGE_SIZE
- lghi %r5,PAGE_SIZE
-1:
- mvcle %r2,%r4,0
- jo 1b
- lg %r2,8(%r1)
- sske %r0,%r2
- lg %r1,16(%r1)
- ltgr %r1,%r1
- jnz 0b
-2:
- ptlb /* flush tlb */
-
- /* Reset System */
- larl %r1,.Lnew_pgm_check_psw
- epsw %r2,%r3
- stm %r2,%r3,0(%r1)
- mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1)
- larl %r1,__swsusp_reset_dma
- lg %r1,0(%r1)
- BASR_EX %r14,%r1
- larl %r1,smp_cpu_mt_shift
- icm %r1,15,0(%r1)
- jz smt_done
- llgfr %r1,%r1
-smt_loop:
- sigp %r1,%r0,SIGP_SET_MULTI_THREADING
- brc 8,smt_done /* accepted */
- brc 2,smt_loop /* busy, try again */
-smt_done:
- larl %r1,.Lnew_pgm_check_psw
- lpswe 0(%r1)
-pgm_check_entry:
-
- /* Switch to original suspend CPU */
- larl %r1,.Lresume_cpu /* Resume CPU address: r2 */
- stap 0(%r1)
- llgh %r2,0(%r1)
- llgh %r1,__LC_EXT_CPU_ADDR(%r0) /* Suspend CPU address: r1 */
- cgr %r1,%r2
- je restore_registers /* r1 = r2 -> nothing to do */
- larl %r4,.Lrestart_suspend_psw /* Set new restart PSW */
- mvc __LC_RST_NEW_PSW(16,%r0),0(%r4)
-3:
- sigp %r9,%r1,SIGP_INITIAL_CPU_RESET /* sigp initial cpu reset */
- brc 8,4f /* accepted */
- brc 2,3b /* busy, try again */
-
- /* Suspend CPU not available -> panic */
- larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD
- larl %r2,.Lpanic_string
- brasl %r14,sclp_early_printk_force
- larl %r3,.Ldisabled_wait_31
- lpsw 0(%r3)
-4:
- /* Switch to suspend CPU */
- sigp %r9,%r1,SIGP_RESTART /* sigp restart to suspend CPU */
- brc 2,4b /* busy, try again */
-5:
- sigp %r9,%r2,SIGP_STOP /* sigp stop to current resume CPU */
- brc 2,5b /* busy, try again */
-6: j 6b
-
-restart_suspend:
- larl %r1,.Lresume_cpu
- llgh %r2,0(%r1)
-7:
- sigp %r9,%r2,SIGP_SENSE /* sigp sense, wait for resume CPU */
- brc 8,7b /* accepted, status 0, still running */
- brc 2,7b /* busy, try again */
- tmll %r9,0x40 /* Test if resume CPU is stopped */
- jz 7b
-
-restore_registers:
- /* Restore registers */
- lghi %r13,0x1000 /* %r1 = pointer to save area */
-
- /* Ignore time spent in suspended state. */
- llgf %r1,0x318(%r13)
- stck __LC_LAST_UPDATE_CLOCK(%r1)
- spt 0x328(%r13) /* reprogram timer */
- //sckc 0x330(%r13) /* set clock comparator */
-
- lctlg %c0,%c15,0x380(%r13) /* load control registers */
- lam %a0,%a15,0x340(%r13) /* load access registers */
-
- /* Load old stack */
- lg %r15,0x2f8(%r13)
-
- /* Save prefix register */
- mvc __SF_EMPTY(4,%r15),0x318(%r13)
-
- /* Restore absolute zero pages */
- lghi %r2,0
- larl %r4,suspend_zero_pages
- lg %r4,0(%r4)
- lghi %r3,2*PAGE_SIZE
- lghi %r5,2*PAGE_SIZE
-1: mvcle %r2,%r4,0
- jo 1b
-
- /* Restore prefix register */
- spx __SF_EMPTY(%r15)
-
- /* Activate DAT */
- stosm __SF_EMPTY(%r15),0x04
-
- /* Make all free pages unstable */
- lghi %r2,0
- brasl %r14,arch_set_page_states
-
- /* Call arch specific early resume code */
- brasl %r14,s390_early_resume
-
- /* Return 0 */
- lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15)
- lghi %r2,0
- BR_EX %r14
-ENDPROC(swsusp_arch_resume)
-
- .section .data..nosave,"aw",@progbits
- .align 8
-.Ldisabled_wait_31:
- .long 0x000a0000,0x00000000
-.Lpanic_string:
- .asciz "Resume not possible because suspend CPU is no longer available\n"
- .align 8
-.Lrestart_suspend_psw:
- .quad 0x0000000180000000,restart_suspend
-.Lnew_pgm_check_psw:
- .quad 0,pgm_check_entry
-.Lresume_cpu:
- .byte 0,0
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/syscall.c
index 202fa73ac167..dc2355c623d6 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/syscall.c
@@ -29,6 +29,13 @@
#include <linux/unistd.h>
#include <linux/ipc.h>
#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/thread_info.h>
+#include <linux/entry-common.h>
+
+#include <asm/ptrace.h>
+#include <asm/vtime.h>
+
#include "entry.h"
/*
@@ -100,3 +107,64 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
+
+static void do_syscall(struct pt_regs *regs)
+{
+ unsigned long nr;
+
+ nr = regs->int_code & 0xffff;
+ if (!nr) {
+ nr = regs->gprs[1] & 0xffff;
+ regs->int_code &= ~0xffffUL;
+ regs->int_code |= nr;
+ }
+
+ regs->gprs[2] = nr;
+
+ if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) {
+ regs->psw.addr = current->restart_block.arch_data;
+ current->restart_block.arch_data = 1;
+ }
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+
+ /*
+ * In the s390 ptrace ABI, both the syscall number and the return value
+ * use gpr2. However, userspace puts the syscall number either in the
+ * svc instruction itself, or uses gpr1. To make at least skipping syscalls
+ * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here
+ * and if set, the syscall will be skipped.
+ */
+
+ if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)))
+ goto out;
+ regs->gprs[2] = -ENOSYS;
+ if (likely(nr >= NR_syscalls))
+ goto out;
+ do {
+ regs->gprs[2] = current->thread.sys_call_table[nr](regs);
+ } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART));
+out:
+ syscall_exit_to_user_mode_work(regs);
+}
+
+void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
+{
+ add_random_kstack_offset();
+ enter_from_user_mode(regs);
+ regs->psw = S390_lowcore.svc_old_psw;
+ regs->int_code = S390_lowcore.svc_int_code;
+ update_timer_sys();
+ if (static_branch_likely(&cpu_has_bear))
+ current->thread.last_break = regs->last_break;
+
+ local_irq_enable();
+ regs->orig_gpr2 = regs->gprs[2];
+
+ if (per_trap)
+ set_thread_flag(TIF_PER_TRAP);
+
+ regs->flags = 0;
+ set_pt_regs_flag(regs, PIF_SYSCALL);
+ do_syscall(regs);
+ exit_to_user_mode();
+}
diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile
index b98f25029b8e..fb85e797946d 100644
--- a/arch/s390/kernel/syscalls/Makefile
+++ b/arch/s390/kernel/syscalls/Makefile
@@ -21,8 +21,7 @@ uapi: $(uapi-hdrs-y)
# Create output directory if not already present
-_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \
- $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+$(shell mkdir -p $(uapi) $(kapi))
filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $<
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 3054e9c035a3..095bb86339a7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -26,7 +26,7 @@
16 32 lchown - sys_lchown16
19 common lseek sys_lseek compat_sys_lseek
20 common getpid sys_getpid sys_getpid
-21 common mount sys_mount compat_sys_mount
+21 common mount sys_mount sys_mount
22 common umount sys_oldumount sys_oldumount
23 32 setuid - sys_setuid16
24 32 getuid - sys_getuid16
@@ -100,7 +100,7 @@
106 common stat sys_newstat compat_sys_newstat
107 common lstat sys_newlstat compat_sys_newlstat
108 common fstat sys_newfstat compat_sys_newfstat
-110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
+110 common lookup_dcookie - -
111 common vhangup sys_vhangup sys_vhangup
112 common idle - -
114 common wait4 sys_wait4 compat_sys_wait4
@@ -122,7 +122,7 @@
131 common quotactl sys_quotactl sys_quotactl
132 common getpgid sys_getpgid sys_getpgid
133 common fchdir sys_fchdir sys_fchdir
-134 common bdflush sys_bdflush sys_bdflush
+134 common bdflush sys_ni_syscall sys_ni_syscall
135 common sysfs sys_sysfs sys_sysfs
136 common personality sys_s390_personality sys_s390_personality
137 common afs_syscall - -
@@ -134,11 +134,11 @@
142 64 select sys_select -
143 common flock sys_flock sys_flock
144 common msync sys_msync sys_msync
-145 common readv sys_readv compat_sys_readv
-146 common writev sys_writev compat_sys_writev
+145 common readv sys_readv sys_readv
+146 common writev sys_writev sys_writev
147 common getsid sys_getsid sys_getsid
148 common fdatasync sys_fdatasync sys_fdatasync
-149 common _sysctl sys_sysctl compat_sys_sysctl
+149 common _sysctl - -
150 common mlock sys_mlock sys_mlock
151 common munlock sys_munlock sys_munlock
152 common mlockall sys_mlockall sys_mlockall
@@ -274,9 +274,9 @@
265 common statfs64 sys_statfs64 compat_sys_statfs64
266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages
-268 common mbind sys_mbind compat_sys_mbind
-269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
-270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy
+268 common mbind sys_mbind sys_mbind
+269 common get_mempolicy sys_get_mempolicy sys_get_mempolicy
+270 common set_mempolicy sys_set_mempolicy sys_set_mempolicy
271 common mq_open sys_mq_open compat_sys_mq_open
272 common mq_unlink sys_mq_unlink sys_mq_unlink
273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32
@@ -293,7 +293,7 @@
284 common inotify_init sys_inotify_init sys_inotify_init
285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch
286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch
-287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages
+287 common migrate_pages sys_migrate_pages sys_migrate_pages
288 common openat sys_openat compat_sys_openat
289 common mkdirat sys_mkdirat sys_mkdirat
290 common mknodat sys_mknodat sys_mknodat
@@ -316,8 +316,8 @@
306 common splice sys_splice sys_splice
307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range
308 common tee sys_tee sys_tee
-309 common vmsplice sys_vmsplice compat_sys_vmsplice
-310 common move_pages sys_move_pages compat_sys_move_pages
+309 common vmsplice sys_vmsplice sys_vmsplice
+310 common move_pages sys_move_pages sys_move_pages
311 common getcpu sys_getcpu sys_getcpu
312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait
313 common utimes sys_utimes sys_utimes_time32
@@ -347,8 +347,8 @@
337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32
338 common syncfs sys_syncfs sys_syncfs
339 common setns sys_setns sys_setns
-340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
-341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
+340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv
+341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev
342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr
343 common kcmp sys_kcmp sys_kcmp
344 common finit_module sys_finit_module sys_finit_module
@@ -372,8 +372,8 @@
362 common connect sys_connect sys_connect
363 common listen sys_listen sys_listen
364 common accept4 sys_accept4 sys_accept4
-365 common getsockopt sys_getsockopt compat_sys_getsockopt
-366 common setsockopt sys_setsockopt compat_sys_setsockopt
+365 common getsockopt sys_getsockopt sys_getsockopt
+366 common setsockopt sys_setsockopt sys_setsockopt
367 common getsockname sys_getsockname sys_getsockname
368 common getpeername sys_getpeername sys_getpeername
369 common sendto sys_sendto sys_sendto
@@ -438,3 +438,29 @@
433 common fspick sys_fspick sys_fspick
434 common pidfd_open sys_pidfd_open sys_pidfd_open
435 common clone3 sys_clone3 sys_clone3
+436 common close_range sys_close_range sys_close_range
+437 common openat2 sys_openat2 sys_openat2
+438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd
+439 common faccessat2 sys_faccessat2 sys_faccessat2
+440 common process_madvise sys_process_madvise sys_process_madvise
+441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr sys_mount_setattr
+443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd
+444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
+445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
+446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease sys_process_mrelease
+449 common futex_waitv sys_futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common cachestat sys_cachestat sys_cachestat
+452 common fchmodat2 sys_fchmodat2 sys_fchmodat2
+453 common map_shadow_stack sys_map_shadow_stack sys_map_shadow_stack
+454 common futex_wake sys_futex_wake sys_futex_wake
+455 common futex_wait sys_futex_wait sys_futex_wait
+456 common futex_requeue sys_futex_requeue sys_futex_requeue
+457 common statmount sys_statmount sys_statmount
+458 common listmount sys_listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index 2ac3c9b56a13..f6f8f498c9be 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -14,6 +14,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/slab.h>
+#include <asm/asm-extable.h>
#include <asm/ebcdic.h>
#include <asm/debug.h>
#include <asm/sysinfo.h>
@@ -25,19 +26,22 @@ int topology_max_mnest;
static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl)
{
- register int r0 asm("0") = (fc << 28) | sel1;
- register int r1 asm("1") = sel2;
+ int r0 = (fc << 28) | sel1;
int rc = 0;
asm volatile(
- " stsi 0(%3)\n"
+ " lr 0,%[r0]\n"
+ " lr 1,%[r1]\n"
+ " stsi 0(%[sysinfo])\n"
"0: jz 2f\n"
- "1: lhi %1,%4\n"
- "2:\n"
+ "1: lhi %[rc],%[retval]\n"
+ "2: lr %[r0],0\n"
EX_TABLE(0b, 1b)
- : "+d" (r0), "+d" (rc)
- : "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP)
- : "cc", "memory");
+ : [r0] "+d" (r0), [rc] "+d" (rc)
+ : [r1] "d" (sel2),
+ [sysinfo] "a" (sysinfo),
+ [retval] "K" (-EOPNOTSUPP)
+ : "cc", "0", "1", "memory");
*lvl = ((unsigned int) r0) >> 28;
return rc;
}
@@ -77,10 +81,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len)
static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
{
+ bool has_var_cap;
int i;
if (stsi(info, 1, 1, 1))
return;
+ has_var_cap = !!info->model_var_cap[0];
EBCASC(info->manufacturer, sizeof(info->manufacturer));
EBCASC(info->type, sizeof(info->type));
EBCASC(info->model, sizeof(info->model));
@@ -89,6 +95,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
EBCASC(info->model_capacity, sizeof(info->model_capacity));
EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap));
EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap));
+ if (has_var_cap)
+ EBCASC(info->model_var_cap, sizeof(info->model_var_cap));
seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer);
seq_printf(m, "Type: %-4.4s\n", info->type);
if (info->lic)
@@ -116,12 +124,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n",
info->model_temp_cap,
info->model_temp_cap_rating);
+ if (has_var_cap && info->model_var_cap_rating)
+ seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n",
+ info->model_var_cap,
+ info->model_var_cap_rating);
if (info->ncr)
seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr);
if (info->npr)
seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr);
if (info->ntr)
seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr);
+ if (has_var_cap && info->nvr)
+ seq_printf(m, "Nominal Var. Rating: %08u\n", info->nvr);
if (info->cai) {
seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai);
seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr);
diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S
new file mode 100644
index 000000000000..14c6d25c035f
--- /dev/null
+++ b/arch/s390/kernel/text_amode31.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Code that needs to run below 2 GB.
+ *
+ * Copyright IBM Corp. 2019
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm-extable.h>
+#include <asm/errno.h>
+#include <asm/sigp.h>
+
+ .section .amode31.text,"ax"
+/*
+ * Simplified version of expoline thunk. The normal thunks can not be used here,
+ * because they might be more than 2 GB away, and not reachable by the relative
+ * branch. No comdat, exrl, etc. optimizations used here, because it only
+ * affects a few functions that are not performance-relevant.
+ */
+ .macro BR_EX_AMODE31_r14
+ larl %r1,0f
+ ex 0,0(%r1)
+ j .
+0: br %r14
+ .endm
+
+/*
+ * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode)
+ */
+SYM_FUNC_START(_diag14_amode31)
+ lgr %r1,%r2
+ lgr %r2,%r3
+ lgr %r3,%r4
+ lhi %r5,-EIO
+ sam31
+ diag %r1,%r2,0x14
+.Ldiag14_ex:
+ ipm %r5
+ srl %r5,28
+.Ldiag14_fault:
+ sam64
+ lgfr %r2,%r5
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault)
+SYM_FUNC_END(_diag14_amode31)
+
+/*
+ * int _diag210_amode31(struct diag210 *addr)
+ */
+SYM_FUNC_START(_diag210_amode31)
+ lgr %r1,%r2
+ lhi %r2,-1
+ sam31
+ diag %r1,%r0,0x210
+.Ldiag210_ex:
+ ipm %r2
+ srl %r2,28
+.Ldiag210_fault:
+ sam64
+ lgfr %r2,%r2
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault)
+SYM_FUNC_END(_diag210_amode31)
+
+/*
+ * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len)
+*/
+SYM_FUNC_START(_diag8c_amode31)
+ llgf %r3,0(%r3)
+ sam31
+ diag %r2,%r4,0x8c
+.Ldiag8c_ex:
+ sam64
+ lgfr %r2,%r3
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex)
+SYM_FUNC_END(_diag8c_amode31)
+/*
+ * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode)
+ */
+SYM_FUNC_START(_diag26c_amode31)
+ lghi %r5,-EOPNOTSUPP
+ sam31
+ diag %r2,%r4,0x26c
+.Ldiag26c_ex:
+ sam64
+ lgfr %r2,%r5
+ BR_EX_AMODE31_r14
+ EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex)
+SYM_FUNC_END(_diag26c_amode31)
+
+/*
+ * void _diag0c_amode31(struct hypfs_diag0c_entry *entry)
+ */
+SYM_FUNC_START(_diag0c_amode31)
+ sam31
+ diag %r2,%r2,0x0c
+ sam64
+ BR_EX_AMODE31_r14
+SYM_FUNC_END(_diag0c_amode31)
+
+/*
+ * void _diag308_reset_amode31(void)
+ *
+ * Calls diag 308 subcode 1 and continues execution
+ */
+SYM_FUNC_START(_diag308_reset_amode31)
+ larl %r4,ctlregs # Save control registers
+ stctg %c0,%c15,0(%r4)
+ lg %r2,0(%r4) # Disable lowcore protection
+ nilh %r2,0xefff
+ larl %r4,ctlreg0
+ stg %r2,0(%r4)
+ lctlg %c0,%c0,0(%r4)
+ larl %r4,fpctl # Floating point control register
+ stfpc 0(%r4)
+ larl %r4,prefix # Save prefix register
+ stpx 0(%r4)
+ larl %r4,prefix_zero # Set prefix register to 0
+ spx 0(%r4)
+ larl %r4,continue_psw # Save PSW flags
+ epsw %r2,%r3
+ stm %r2,%r3,0(%r4)
+ larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0
+ larl %r3,restart_diag308_psw
+ og %r4,0(%r3) # Save PSW
+ lghi %r3,0
+ sturg %r4,%r3 # Use sturg, because of large pages
+ lghi %r1,1
+ lghi %r0,0
+ diag %r0,%r1,0x308
+.Lrestart_part2:
+ lhi %r0,0 # Load r0 with zero
+ lhi %r1,2 # Use mode 2 = ESAME (dump)
+ sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode
+ sam64 # Switch to 64 bit addressing mode
+ larl %r4,ctlregs # Restore control registers
+ lctlg %c0,%c15,0(%r4)
+ larl %r4,fpctl # Restore floating point ctl register
+ lfpc 0(%r4)
+ larl %r4,prefix # Restore prefix register
+ spx 0(%r4)
+ larl %r4,continue_psw # Restore PSW flags
+ larl %r2,.Lcontinue
+ stg %r2,8(%r4)
+ lpswe 0(%r4)
+.Lcontinue:
+ BR_EX_AMODE31_r14
+SYM_FUNC_END(_diag308_reset_amode31)
+
+ .section .amode31.data,"aw",@progbits
+ .balign 8
+SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000)
+SYM_DATA_LOCAL(continue_psw, .quad 0,0)
+SYM_DATA_LOCAL(ctlreg0, .quad 0)
+SYM_DATA_LOCAL(ctlregs, .fill 16,8,0)
+SYM_DATA_LOCAL(fpctl, .long 0)
+SYM_DATA_LOCAL(prefix, .long 0)
+SYM_DATA_LOCAL(prefix_zero, .long 0)
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index f9d070d016e3..14abad953c02 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -41,6 +41,9 @@
#include <linux/gfp.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
+#include <vdso/vsyscall.h>
+#include <vdso/clocksource.h>
+#include <vdso/helpers.h>
#include <asm/facility.h>
#include <asm/delay.h>
#include <asm/div64.h>
@@ -52,11 +55,7 @@
#include <asm/cio.h>
#include "entry.h"
-unsigned char tod_clock_base[16] __aligned(8) = {
- /* Force to data section. */
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
+union tod_clock tod_clock_base __section(".data");
EXPORT_SYMBOL_GPL(tod_clock_base);
u64 clock_comparator_max = -1ULL;
@@ -69,10 +68,10 @@ EXPORT_SYMBOL(s390_epoch_delta_notifier);
unsigned char ptff_function_mask[16];
-static unsigned long long lpar_offset;
-static unsigned long long initial_leap_seconds;
-static unsigned long long tod_steering_end;
-static long long tod_steering_delta;
+static unsigned long lpar_offset;
+static unsigned long initial_leap_seconds;
+static unsigned long tod_steering_end;
+static long tod_steering_delta;
/*
* Get time offsets with PTFF
@@ -81,10 +80,12 @@ void __init time_early_init(void)
{
struct ptff_qto qto;
struct ptff_qui qui;
+ int cs;
/* Initialize TOD steering parameters */
- tod_steering_end = *(unsigned long long *) &tod_clock_base[1];
- vdso_data->ts_end = tod_steering_end;
+ tod_steering_end = tod_clock_base.tod;
+ for (cs = 0; cs < CS_BASES; cs++)
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
if (!test_facility(28))
return;
@@ -97,10 +98,15 @@ void __init time_early_init(void)
/* get initial leap seconds */
if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0)
- initial_leap_seconds = (unsigned long long)
+ initial_leap_seconds = (unsigned long)
((long) qui.old_leap * 4096000000L);
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return tod_to_ns(__get_tod_clock_monotonic());
+}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -110,18 +116,13 @@ unsigned long long notrace sched_clock(void)
}
NOKPROBE_SYMBOL(sched_clock);
-static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt)
+static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt)
{
- unsigned long long high, low, rem, sec, nsec;
+ unsigned long rem, sec, nsec;
- /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */
- high = (*(unsigned long long *) clk) >> 4;
- low = (*(unsigned long long *)&clk[7]) << 4;
- /* Calculate seconds and nano-seconds */
- sec = high;
+ sec = clk->us;
rem = do_div(sec, 1000000);
- nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32;
-
+ nsec = ((clk->sus + (rem << 12)) * 125) >> 9;
xt->tv_sec = sec;
xt->tv_nsec = nsec;
}
@@ -172,10 +173,10 @@ void init_cpu_timer(void)
clockevents_register_device(cd);
/* Enable clock comparator timer interrupt. */
- __ctl_set_bit(0,11);
+ local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT);
/* Always allow the timing alert external interrupt. */
- __ctl_set_bit(0, 4);
+ local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT);
}
static void clock_comparator_interrupt(struct ext_code ext_code,
@@ -201,30 +202,26 @@ static void stp_reset(void);
void read_persistent_clock64(struct timespec64 *ts)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- get_tod_clock_ext(clk);
- *(__u64 *) &clk[1] -= delta;
- if (*(__u64 *) &clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, ts);
+ store_tod_clock_ext(&clk);
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, ts);
}
void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
struct timespec64 *boot_offset)
{
- unsigned char clk[STORE_CLOCK_EXT_SIZE];
struct timespec64 boot_time;
- __u64 delta;
+ union tod_clock clk;
+ u64 delta;
delta = initial_leap_seconds + TOD_UNIX_EPOCH;
- memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE);
- *(__u64 *)&clk[1] -= delta;
- if (*(__u64 *)&clk[1] > delta)
- clk[0]--;
- ext_to_timespec64(clk, &boot_time);
+ clk = tod_clock_base;
+ clk.eitod -= delta;
+ ext_to_timespec64(&clk, &boot_time);
read_persistent_clock64(wall_time);
*boot_offset = timespec64_sub(*wall_time, boot_time);
@@ -232,12 +229,12 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
static u64 read_tod_clock(struct clocksource *cs)
{
- unsigned long long now, adj;
+ unsigned long now, adj;
preempt_disable(); /* protect from changes to steering parameters */
now = get_tod_clock();
adj = tod_steering_end - now;
- if (unlikely((s64) adj >= 0))
+ if (unlikely((s64) adj > 0))
/*
* manually steer by 1 cycle every 2^16 cycles. This
* corresponds to shifting the tod delta by 15. 1s is
@@ -253,10 +250,11 @@ static struct clocksource clocksource_tod = {
.name = "tod",
.rating = 400,
.read = read_tod_clock,
- .mask = -1ULL,
+ .mask = CLOCKSOURCE_MASK(64),
.mult = 1000,
.shift = 12,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .vdso_clock_mode = VDSO_CLOCKMODE_TOD,
};
struct clocksource * __init clocksource_default_clock(void)
@@ -264,55 +262,6 @@ struct clocksource * __init clocksource_default_clock(void)
return &clocksource_tod;
}
-void update_vsyscall(struct timekeeper *tk)
-{
- u64 nsecps;
-
- if (tk->tkr_mono.clock != &clocksource_tod)
- return;
-
- /* Make userspace gettimeofday spin until we're done. */
- ++vdso_data->tb_update_count;
- smp_wmb();
- vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
- vdso_data->xtime_clock_sec = tk->xtime_sec;
- vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
- vdso_data->wtom_clock_sec =
- tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
- + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
- nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
- while (vdso_data->wtom_clock_nsec >= nsecps) {
- vdso_data->wtom_clock_nsec -= nsecps;
- vdso_data->wtom_clock_sec++;
- }
-
- vdso_data->xtime_coarse_sec = tk->xtime_sec;
- vdso_data->xtime_coarse_nsec =
- (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
- vdso_data->wtom_coarse_sec =
- vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdso_data->wtom_coarse_nsec =
- vdso_data->xtime_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
- while (vdso_data->wtom_coarse_nsec >= NSEC_PER_SEC) {
- vdso_data->wtom_coarse_nsec -= NSEC_PER_SEC;
- vdso_data->wtom_coarse_sec++;
- }
-
- vdso_data->tk_mult = tk->tkr_mono.mult;
- vdso_data->tk_shift = tk->tkr_mono.shift;
- smp_wmb();
- ++vdso_data->tb_update_count;
-}
-
-extern struct timezone sys_tz;
-
-void update_vsyscall_tz(void)
-{
- vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
- vdso_data->tz_dsttime = sys_tz.tz_dsttime;
-}
-
/*
* Initialize the TOD clock and the CPU timer of
* the boot cpu.
@@ -341,11 +290,12 @@ void __init time_init(void)
}
static DEFINE_PER_CPU(atomic_t, clock_sync_word);
-static DEFINE_MUTEX(clock_sync_mutex);
+static DEFINE_MUTEX(stp_mutex);
static unsigned long clock_sync_flags;
-#define CLOCK_SYNC_HAS_STP 0
-#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_HAS_STP 0
+#define CLOCK_SYNC_STP 1
+#define CLOCK_SYNC_STPINFO_VALID 2
/*
* The get_clock function for the physical clock. It will get the current
@@ -419,18 +369,15 @@ static inline int check_sync_clock(void)
* Apply clock delta to the global data structures.
* This is called once on the CPU that performed the clock sync.
*/
-static void clock_sync_global(unsigned long long delta)
+static void clock_sync_global(long delta)
{
unsigned long now, adj;
struct ptff_qto qto;
+ int cs;
/* Fixup the monotonic sched clock. */
- *(unsigned long long *) &tod_clock_base[1] += delta;
- if (*(unsigned long long *) &tod_clock_base[1] < delta)
- /* Epoch overflow */
- tod_clock_base[0]++;
+ tod_clock_base.eitod += delta;
/* Adjust TOD steering parameters. */
- vdso_data->tb_update_count++;
now = get_tod_clock();
adj = tod_steering_end - now;
if (unlikely((s64) adj >= 0))
@@ -439,12 +386,14 @@ static void clock_sync_global(unsigned long long delta)
-(adj >> 15) : (adj >> 15);
tod_steering_delta += delta;
if ((abs(tod_steering_delta) >> 48) != 0)
- panic("TOD clock sync offset %lli is too large to drift\n",
+ panic("TOD clock sync offset %li is too large to drift\n",
tod_steering_delta);
tod_steering_end = now + (abs(tod_steering_delta) << 15);
- vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1;
- vdso_data->ts_end = tod_steering_end;
- vdso_data->tb_update_count++;
+ for (cs = 0; cs < CS_BASES; cs++) {
+ vdso_data[cs].arch_data.tod_steering_end = tod_steering_end;
+ vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta;
+ }
+
/* Update LPAR offset. */
if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
lpar_offset = qto.tod_epoch_difference;
@@ -456,7 +405,7 @@ static void clock_sync_global(unsigned long long delta)
* Apply clock delta to the per-CPU data structures of this CPU.
* This is called for each online CPU after the call to clock_sync_global.
*/
-static void clock_sync_local(unsigned long long delta)
+static void clock_sync_local(long delta)
{
/* Add the delta to the clock comparator. */
if (S390_lowcore.clock_comparator != clock_comparator_max) {
@@ -480,7 +429,7 @@ static void __init time_init_wq(void)
struct clock_sync_data {
atomic_t cpus;
int in_sync;
- unsigned long long clock_delta;
+ long clock_delta;
};
/*
@@ -491,7 +440,6 @@ static struct stp_sstpi stp_info;
static void *stp_page;
static void stp_work_fn(struct work_struct *work);
-static DEFINE_MUTEX(stp_work_mutex);
static DECLARE_WORK(stp_work, stp_work_fn);
static struct timer_list stp_timer;
@@ -582,10 +530,26 @@ void stp_queue_work(void)
queue_work(time_sync_wq, &stp_work);
}
+static int __store_stpinfo(void)
+{
+ int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+
+ if (rc)
+ clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ else
+ set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+ return rc;
+}
+
+static int stpinfo_valid(void)
+{
+ return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+}
+
static int stp_sync_clock(void *data)
{
struct clock_sync_data *sync = data;
- unsigned long long clock_delta;
+ long clock_delta, flags;
static int first;
int rc;
@@ -595,19 +559,18 @@ static int stp_sync_clock(void *data)
while (atomic_read(&sync->cpus) != 0)
cpu_relax();
rc = 0;
- if (stp_info.todoff[0] || stp_info.todoff[1] ||
- stp_info.todoff[2] || stp_info.todoff[3] ||
- stp_info.tmd != 2) {
+ if (stp_info.todoff || stp_info.tmd != 2) {
+ flags = vdso_update_begin();
rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0,
&clock_delta);
if (rc == 0) {
sync->clock_delta = clock_delta;
clock_sync_global(clock_delta);
- rc = chsc_sstpi(stp_page, &stp_info,
- sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc == 0 && stp_info.tmd != 2)
rc = -EAGAIN;
}
+ vdso_update_end(flags);
}
sync->in_sync = rc ? -EAGAIN : 1;
xchg(&first, 0);
@@ -627,6 +590,81 @@ static int stp_sync_clock(void *data)
return 0;
}
+static int stp_clear_leap(void)
+{
+ struct __kernel_timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(txc));
+
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return ret;
+
+ txc.modes = ADJ_STATUS;
+ txc.status &= ~(STA_INS|STA_DEL);
+ return do_adjtimex(&txc);
+}
+
+static void stp_check_leap(void)
+{
+ struct stp_stzi stzi;
+ struct stp_lsoib *lsoib = &stzi.lsoib;
+ struct __kernel_timex txc;
+ int64_t timediff;
+ int leapdiff, ret;
+
+ if (!stp_info.lu || !check_sync_clock()) {
+ /*
+ * Either a scheduled leap second was removed by the operator,
+ * or STP is out of sync. In both cases, clear the leap second
+ * kernel flags.
+ */
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ return;
+ }
+
+ if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) {
+ pr_err("stzi failed\n");
+ return;
+ }
+
+ timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC;
+ leapdiff = lsoib->nlso - lsoib->also;
+
+ if (leapdiff != 1 && leapdiff != -1) {
+ pr_err("Cannot schedule %d leap seconds\n", leapdiff);
+ return;
+ }
+
+ if (timediff < 0) {
+ if (stp_clear_leap() < 0)
+ pr_err("failed to clear leap second flags\n");
+ } else if (timediff < 7200) {
+ memset(&txc, 0, sizeof(txc));
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ return;
+
+ txc.modes = ADJ_STATUS;
+ if (leapdiff > 0)
+ txc.status |= STA_INS;
+ else
+ txc.status |= STA_DEL;
+ ret = do_adjtimex(&txc);
+ if (ret < 0)
+ pr_err("failed to set leap second flags\n");
+ /* arm Timer to clear leap second flags */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC));
+ } else {
+ /* The day the leap second is scheduled for hasn't been reached. Retry
+ * in one hour.
+ */
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC));
+ }
+}
+
/*
* STP work. Check for the STP state and take over the clock
* synchronization if the STP clock source is usable.
@@ -637,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
int rc;
/* prevent multiple execution. */
- mutex_lock(&stp_work_mutex);
+ mutex_lock(&stp_mutex);
if (!stp_online) {
chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL);
@@ -645,33 +683,34 @@ static void stp_work_fn(struct work_struct *work)
goto out_unlock;
}
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL);
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xf0e0, NULL);
if (rc)
goto out_unlock;
- rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+ rc = __store_stpinfo();
if (rc || stp_info.c == 0)
goto out_unlock;
/* Skip synchronization if the clock is already in sync. */
- if (check_sync_clock())
- goto out_unlock;
-
- memset(&stp_sync, 0, sizeof(stp_sync));
- cpus_read_lock();
- atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
- stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
- cpus_read_unlock();
+ if (!check_sync_clock()) {
+ memset(&stp_sync, 0, sizeof(stp_sync));
+ cpus_read_lock();
+ atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
+ stop_machine_cpuslocked(stp_sync_clock, &stp_sync, cpu_online_mask);
+ cpus_read_unlock();
+ }
if (!check_sync_clock())
/*
- * There is a usable clock but the synchonization failed.
+ * There is a usable clock but the synchronization failed.
* Retry after a second.
*/
- mod_timer(&stp_timer, jiffies + HZ);
+ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC));
+ else if (stp_info.lu)
+ stp_check_leap();
out_unlock:
- mutex_unlock(&stp_work_mutex);
+ mutex_unlock(&stp_mutex);
}
/*
@@ -682,115 +721,178 @@ static struct bus_type stp_subsys = {
.dev_name = "stp",
};
-static ssize_t stp_ctn_id_show(struct device *dev,
+static ssize_t ctn_id_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%016llx\n",
- *(unsigned long long *) stp_info.ctnid);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%016lx\n",
+ *(unsigned long *) stp_info.ctnid);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
+static DEVICE_ATTR_RO(ctn_id);
-static ssize_t stp_ctn_type_show(struct device *dev,
+static ssize_t ctn_type_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.ctn);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.ctn);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
+static DEVICE_ATTR_RO(ctn_type);
-static ssize_t stp_dst_offset_show(struct device *dev,
+static ssize_t dst_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x2000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x2000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
+static DEVICE_ATTR_RO(dst_offset);
-static ssize_t stp_leap_seconds_show(struct device *dev,
+static ssize_t leap_seconds_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x8000))
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x8000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ mutex_unlock(&stp_mutex);
+ return ret;
+}
+
+static DEVICE_ATTR_RO(leap_seconds);
+
+static ssize_t leap_seconds_scheduled_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct stp_stzi stzi;
+ ssize_t ret;
+
+ mutex_lock(&stp_mutex);
+ if (!stpinfo_valid() || !(stp_info.vbits & 0x8000) || !stp_info.lu) {
+ mutex_unlock(&stp_mutex);
return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ }
+
+ ret = chsc_stzi(stp_page, &stzi, sizeof(stzi));
+ mutex_unlock(&stp_mutex);
+ if (ret < 0)
+ return ret;
+
+ if (!stzi.lsoib.p)
+ return sprintf(buf, "0,0\n");
+
+ return sprintf(buf, "%lu,%d\n",
+ tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC,
+ stzi.lsoib.nlso - stzi.lsoib.also);
}
-static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
+static DEVICE_ATTR_RO(leap_seconds_scheduled);
-static ssize_t stp_stratum_show(struct device *dev,
+static ssize_t stratum_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
+static DEVICE_ATTR_RO(stratum);
-static ssize_t stp_time_offset_show(struct device *dev,
+static ssize_t time_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x0800))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int) stp_info.tto);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x0800))
+ ret = sprintf(buf, "%i\n", (int) stp_info.tto);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
+static DEVICE_ATTR_RO(time_offset);
-static ssize_t stp_time_zone_offset_show(struct device *dev,
+static ssize_t time_zone_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online || !(stp_info.vbits & 0x4000))
- return -ENODATA;
- return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid() && (stp_info.vbits & 0x4000))
+ ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(time_zone_offset, 0400,
- stp_time_zone_offset_show, NULL);
+static DEVICE_ATTR_RO(time_zone_offset);
-static ssize_t stp_timing_mode_show(struct device *dev,
+static ssize_t timing_mode_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tmd);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tmd);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
+static DEVICE_ATTR_RO(timing_mode);
-static ssize_t stp_timing_state_show(struct device *dev,
+static ssize_t timing_state_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- if (!stp_online)
- return -ENODATA;
- return sprintf(buf, "%i\n", stp_info.tst);
+ ssize_t ret = -ENODATA;
+
+ mutex_lock(&stp_mutex);
+ if (stpinfo_valid())
+ ret = sprintf(buf, "%i\n", stp_info.tst);
+ mutex_unlock(&stp_mutex);
+ return ret;
}
-static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
+static DEVICE_ATTR_RO(timing_state);
-static ssize_t stp_online_show(struct device *dev,
+static ssize_t online_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%i\n", stp_online);
}
-static ssize_t stp_online_store(struct device *dev,
+static ssize_t online_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -801,14 +903,14 @@ static ssize_t stp_online_store(struct device *dev,
return -EINVAL;
if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
return -EOPNOTSUPP;
- mutex_lock(&clock_sync_mutex);
+ mutex_lock(&stp_mutex);
stp_online = value;
if (stp_online)
set_bit(CLOCK_SYNC_STP, &clock_sync_flags);
else
clear_bit(CLOCK_SYNC_STP, &clock_sync_flags);
queue_work(time_sync_wq, &stp_work);
- mutex_unlock(&clock_sync_mutex);
+ mutex_unlock(&stp_mutex);
return count;
}
@@ -816,46 +918,27 @@ static ssize_t stp_online_store(struct device *dev,
* Can't use DEVICE_ATTR because the attribute should be named
* stp/online but dev_attr_online already exists in this file ..
*/
-static struct device_attribute dev_attr_stp_online = {
- .attr = { .name = "online", .mode = 0600 },
- .show = stp_online_show,
- .store = stp_online_store,
-};
-
-static struct device_attribute *stp_attributes[] = {
- &dev_attr_ctn_id,
- &dev_attr_ctn_type,
- &dev_attr_dst_offset,
- &dev_attr_leap_seconds,
- &dev_attr_stp_online,
- &dev_attr_stratum,
- &dev_attr_time_offset,
- &dev_attr_time_zone_offset,
- &dev_attr_timing_mode,
- &dev_attr_timing_state,
+static DEVICE_ATTR_RW(online);
+
+static struct attribute *stp_dev_attrs[] = {
+ &dev_attr_ctn_id.attr,
+ &dev_attr_ctn_type.attr,
+ &dev_attr_dst_offset.attr,
+ &dev_attr_leap_seconds.attr,
+ &dev_attr_online.attr,
+ &dev_attr_leap_seconds_scheduled.attr,
+ &dev_attr_stratum.attr,
+ &dev_attr_time_offset.attr,
+ &dev_attr_time_zone_offset.attr,
+ &dev_attr_timing_mode.attr,
+ &dev_attr_timing_state.attr,
NULL
};
+ATTRIBUTE_GROUPS(stp_dev);
static int __init stp_init_sysfs(void)
{
- struct device_attribute **attr;
- int rc;
-
- rc = subsys_system_register(&stp_subsys, NULL);
- if (rc)
- goto out;
- for (attr = stp_attributes; *attr; attr++) {
- rc = device_create_file(stp_subsys.dev_root, *attr);
- if (rc)
- goto out_unreg;
- }
- return 0;
-out_unreg:
- for (; attr >= stp_attributes; attr--)
- device_remove_file(stp_subsys.dev_root, *attr);
- bus_unregister(&stp_subsys);
-out:
- return rc;
+ return subsys_system_register(&stp_subsys, stp_dev_groups);
}
device_initcall(stp_init_sysfs);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3627953007ed..89e91b8ce842 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2007, 2011
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#define KMSG_COMPONENT "cpu"
@@ -26,7 +25,6 @@
#include <linux/nodemask.h>
#include <linux/node.h>
#include <asm/sysinfo.h>
-#include <asm/numa.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@@ -63,50 +61,56 @@ static struct mask_info drawer_info;
struct cpu_topology_s390 cpu_topology[NR_CPUS];
EXPORT_SYMBOL_GPL(cpu_topology);
-cpumask_t cpus_with_topology;
-
-static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
+static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu)
{
- cpumask_t mask;
+ static cpumask_t mask;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
switch (topology_mode) {
case TOPOLOGY_MODE_HW:
while (info) {
if (cpumask_test_cpu(cpu, &info->mask)) {
- mask = info->mask;
+ cpumask_copy(&mask, &info->mask);
break;
}
info = info->next;
}
- if (cpumask_empty(&mask))
- cpumask_copy(&mask, cpumask_of(cpu));
break;
case TOPOLOGY_MODE_PACKAGE:
cpumask_copy(&mask, cpu_present_mask);
break;
default:
- /* fallthrough */
+ fallthrough;
case TOPOLOGY_MODE_SINGLE:
- cpumask_copy(&mask, cpumask_of(cpu));
break;
}
- return mask;
+ cpumask_and(&mask, &mask, &cpu_setup_mask);
+out:
+ cpumask_copy(dst, &mask);
}
-static cpumask_t cpu_thread_map(unsigned int cpu)
+static void cpu_thread_map(cpumask_t *dst, unsigned int cpu)
{
- cpumask_t mask;
- int i;
+ static cpumask_t mask;
+ unsigned int max_cpu;
- cpumask_copy(&mask, cpumask_of(cpu));
+ cpumask_clear(&mask);
+ if (!cpumask_test_cpu(cpu, &cpu_setup_mask))
+ goto out;
+ cpumask_set_cpu(cpu, &mask);
if (topology_mode != TOPOLOGY_MODE_HW)
- return mask;
+ goto out;
cpu -= cpu % (smp_cpu_mtid + 1);
- for (i = 0; i <= smp_cpu_mtid; i++)
- if (cpu_present(cpu + i))
- cpumask_set_cpu(cpu + i, &mask);
- return mask;
+ max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1);
+ for (; cpu <= max_cpu; cpu++) {
+ if (cpumask_test_cpu(cpu, &cpu_setup_mask))
+ cpumask_set_cpu(cpu, &mask);
+ }
+out:
+ cpumask_copy(dst, &mask);
}
#define TOPOLOGY_CORE_BITS 64
@@ -120,26 +124,26 @@ static void add_cpus_to_mask(struct topology_core *tl_core,
unsigned int core;
for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) {
- unsigned int rcore;
- int lcpu, i;
+ unsigned int max_cpu, rcore;
+ int cpu;
rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin;
- lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift);
- if (lcpu < 0)
+ cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift);
+ if (cpu < 0)
continue;
- for (i = 0; i <= smp_cpu_mtid; i++) {
- topo = &cpu_topology[lcpu + i];
+ max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1);
+ for (; cpu <= max_cpu; cpu++) {
+ topo = &cpu_topology[cpu];
topo->drawer_id = drawer->id;
topo->book_id = book->id;
topo->socket_id = socket->id;
topo->core_id = rcore;
- topo->thread_id = lcpu + i;
+ topo->thread_id = cpu;
topo->dedicated = tl_core->d;
- cpumask_set_cpu(lcpu + i, &drawer->mask);
- cpumask_set_cpu(lcpu + i, &book->mask);
- cpumask_set_cpu(lcpu + i, &socket->mask);
- cpumask_set_cpu(lcpu + i, &cpus_with_topology);
- smp_cpu_set_polarization(lcpu + i, tl_core->pp);
+ cpumask_set_cpu(cpu, &drawer->mask);
+ cpumask_set_cpu(cpu, &book->mask);
+ cpumask_set_cpu(cpu, &socket->mask);
+ smp_cpu_set_polarization(cpu, tl_core->pp);
}
}
}
@@ -245,17 +249,18 @@ int topology_set_cpu_management(int fc)
return rc;
}
-static void update_cpu_masks(void)
+void update_cpu_masks(void)
{
- struct cpu_topology_s390 *topo;
- int cpu, id;
+ struct cpu_topology_s390 *topo, *topo_package, *topo_sibling;
+ int cpu, sibling, pkg_first, smt_first, id;
for_each_possible_cpu(cpu) {
topo = &cpu_topology[cpu];
- topo->thread_mask = cpu_thread_map(cpu);
- topo->core_mask = cpu_group_map(&socket_info, cpu);
- topo->book_mask = cpu_group_map(&book_info, cpu);
- topo->drawer_mask = cpu_group_map(&drawer_info, cpu);
+ cpu_thread_map(&topo->thread_mask, cpu);
+ cpu_group_map(&topo->core_mask, &socket_info, cpu);
+ cpu_group_map(&topo->book_mask, &book_info, cpu);
+ cpu_group_map(&topo->drawer_mask, &drawer_info, cpu);
+ topo->booted_cores = 0;
if (topology_mode != TOPOLOGY_MODE_HW) {
id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu;
topo->thread_id = cpu;
@@ -263,11 +268,23 @@ static void update_cpu_masks(void)
topo->socket_id = id;
topo->book_id = id;
topo->drawer_id = id;
- if (cpu_present(cpu))
- cpumask_set_cpu(cpu, &cpus_with_topology);
}
}
- numa_update_cpu_topology();
+ for_each_online_cpu(cpu) {
+ topo = &cpu_topology[cpu];
+ pkg_first = cpumask_first(&topo->core_mask);
+ topo_package = &cpu_topology[pkg_first];
+ if (cpu == pkg_first) {
+ for_each_cpu(sibling, &topo->core_mask) {
+ topo_sibling = &cpu_topology[sibling];
+ smt_first = cpumask_first(&topo_sibling->thread_mask);
+ if (sibling == smt_first)
+ topo_package->booted_cores++;
+ }
+ } else {
+ topo->booted_cores = topo_package->booted_cores;
+ }
+ }
}
void store_topology(struct sysinfo_15_1_x *info)
@@ -289,7 +306,6 @@ static int __arch_update_cpu_topology(void)
int rc = 0;
mutex_lock(&smp_cpu_state_mutex);
- cpumask_clear(&cpus_with_topology);
if (MACHINE_HAS_TOPOLOGY) {
rc = 1;
store_topology(info);
@@ -346,9 +362,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0);
static void set_topology_timer(void)
{
if (atomic_add_unless(&topology_poll, -1, 0))
- mod_timer(&topology_timer, jiffies + HZ / 10);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100));
else
- mod_timer(&topology_timer, jiffies + HZ * 60);
+ mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC));
}
void topology_expect_change(void)
@@ -391,7 +407,7 @@ static ssize_t dispatching_store(struct device *dev,
if (val != 0 && val != 1)
return -EINVAL;
rc = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smp_cpu_state_mutex);
if (cpu_management == val)
goto out;
@@ -402,7 +418,7 @@ static ssize_t dispatching_store(struct device *dev,
topology_expect_change();
out:
mutex_unlock(&smp_cpu_state_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return rc ? rc : count;
}
static DEVICE_ATTR_RW(dispatching);
@@ -506,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, SD_INIT_NAME(PKG) },
{ NULL, },
};
@@ -554,6 +570,7 @@ void __init topology_init_early(void)
alloc_masks(info, &book_info, 2);
alloc_masks(info, &drawer_info, 3);
out:
+ cpumask_set_cpu(0, &cpu_setup_mask);
__arch_update_cpu_topology();
__arch_update_dedicated_flag(NULL);
}
@@ -584,7 +601,7 @@ static int __init topology_setup(char *str)
early_param("topology", topology_setup);
static int topology_ctl_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int enabled = topology_is_enabled();
int new_mode;
@@ -619,27 +636,25 @@ static struct ctl_table topology_ctl_table[] = {
.mode = 0644,
.proc_handler = topology_ctl_handler,
},
- { },
-};
-
-static struct ctl_table topology_dir_table[] = {
- {
- .procname = "s390",
- .maxlen = 0,
- .mode = 0555,
- .child = topology_ctl_table,
- },
- { },
};
static int __init topology_init(void)
{
+ struct device *dev_root;
+ int rc = 0;
+
timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE);
if (MACHINE_HAS_TOPOLOGY)
set_topology_timer();
else
topology_update_polarization_simple();
- register_sysctl_table(topology_dir_table);
- return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching);
+ register_sysctl("s390", topology_ctl_table);
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_dispatching);
+ put_device(dev_root);
+ }
+ return rc;
}
device_initcall(topology_init);
diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c
index 490b52e85014..11a669f3cc93 100644
--- a/arch/s390/kernel/trace.c
+++ b/arch/s390/kernel/trace.c
@@ -14,7 +14,7 @@ EXPORT_TRACEPOINT_SYMBOL(s390_diagnose);
static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth);
-void trace_s390_diagnose_norecursion(int diag_nr)
+void notrace trace_s390_diagnose_norecursion(int diag_nr)
{
unsigned long flags;
unsigned int *depth;
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 164c0282b41a..46dac4540ca8 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -13,8 +13,11 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
+#include "asm/irqflags.h"
+#include "asm/ptrace.h"
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/randomize_kstack.h>
#include <linux/extable.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
@@ -23,7 +26,10 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>
+#include <linux/entry-common.h>
+#include <asm/asm-extable.h>
#include <asm/fpu/api.h>
+#include <asm/vtime.h>
#include "entry.h"
static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -31,16 +37,18 @@ static inline void __user *get_trap_ip(struct pt_regs *regs)
unsigned long address;
if (regs->int_code & 0x200)
- address = *(unsigned long *)(current->thread.trap_tdb + 24);
+ address = current->thread.trap_tdb.data[3];
else
address = regs->psw.addr;
return (void __user *) (address - (regs->int_code >> 16));
}
+#ifdef CONFIG_GENERIC_BUG
int is_valid_bugaddr(unsigned long addr)
{
return 1;
}
+#endif
void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
{
@@ -48,18 +56,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
force_sig_fault(si_signo, si_code, get_trap_ip(regs));
report_user_fault(regs, si_signo, 0);
} else {
- const struct exception_table_entry *fixup;
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup)
- regs->psw.addr = extable_fixup(fixup);
- else {
- enum bug_trap_type btt;
-
- btt = report_bug(regs->psw.addr, regs);
- if (btt == BUG_TRAP_TYPE_WARN)
- return;
+ if (!fixup_exception(regs))
die(regs, str);
- }
}
}
@@ -83,17 +81,17 @@ void do_per_trap(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_per_trap);
-void default_trap_handler(struct pt_regs *regs)
+static void default_trap_handler(struct pt_regs *regs)
{
if (user_mode(regs)) {
report_user_fault(regs, SIGSEGV, 0);
- do_exit(SIGSEGV);
+ force_exit_sig(SIGSEGV);
} else
die(regs, "Unknown program exception");
}
#define DO_ERROR_INFO(name, signr, sicode, str) \
-void name(struct pt_regs *regs) \
+static void name(struct pt_regs *regs) \
{ \
do_trap(regs, signr, sicode, str); \
}
@@ -145,13 +143,13 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc)
do_trap(regs, SIGFPE, si_code, "floating point exception");
}
-void translation_exception(struct pt_regs *regs)
+static void translation_specification_exception(struct pt_regs *regs)
{
/* May never happen. */
- panic("Translation exception");
+ panic("Translation-Specification Exception");
}
-void illegal_op(struct pt_regs *regs)
+static void illegal_op(struct pt_regs *regs)
{
__u8 opcode[6];
__u16 __user *location;
@@ -193,11 +191,11 @@ NOKPROBE_SYMBOL(illegal_op);
DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN,
"specification exception");
-void vector_exception(struct pt_regs *regs)
+static void vector_exception(struct pt_regs *regs)
{
int si_code, vic;
- if (!MACHINE_HAS_VX) {
+ if (!cpu_has_vx()) {
do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation");
return;
}
@@ -227,7 +225,7 @@ void vector_exception(struct pt_regs *regs)
do_trap(regs, SIGFPE, si_code, "vector exception");
}
-void data_exception(struct pt_regs *regs)
+static void data_exception(struct pt_regs *regs)
{
save_fpu_regs();
if (current->thread.fpu.fpc & FPC_DXC_MASK)
@@ -236,7 +234,7 @@ void data_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_ILLOPN, "data exception");
}
-void space_switch_exception(struct pt_regs *regs)
+static void space_switch_exception(struct pt_regs *regs)
{
/* Set user psw back to home space mode. */
if (user_mode(regs))
@@ -245,6 +243,23 @@ void space_switch_exception(struct pt_regs *regs)
do_trap(regs, SIGILL, ILL_PRVOPC, "space switch event");
}
+static void monitor_event_exception(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ return;
+
+ switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) {
+ case BUG_TRAP_TYPE_NONE:
+ fixup_exception(regs);
+ break;
+ case BUG_TRAP_TYPE_WARN:
+ break;
+ case BUG_TRAP_TYPE_BUG:
+ die(regs, "monitor event");
+ break;
+ }
+}
+
void kernel_stack_overflow(struct pt_regs *regs)
{
bust_spinlocks(1);
@@ -255,8 +270,147 @@ void kernel_stack_overflow(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kernel_stack_overflow);
+static void __init test_monitor_call(void)
+{
+ int val = 1;
+
+ if (!IS_ENABLED(CONFIG_BUG))
+ return;
+ asm volatile(
+ " mc 0,0\n"
+ "0: xgr %0,%0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "+d" (val));
+ if (!val)
+ panic("Monitor call doesn't work!\n");
+}
+
void __init trap_init(void)
{
- sort_extable(__start_dma_ex_table, __stop_dma_ex_table);
+ unsigned long flags;
+ struct ctlreg cr0;
+
+ local_irq_save(flags);
+ cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT);
+ psw_bits(S390_lowcore.external_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.program_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.svc_new_psw).mcheck = 1;
+ psw_bits(S390_lowcore.io_new_psw).mcheck = 1;
+ local_ctl_load(0, &cr0);
+ local_irq_restore(flags);
local_mcck_enable();
+ test_monitor_call();
}
+
+static void (*pgm_check_table[128])(struct pt_regs *regs);
+
+void noinstr __do_pgm_check(struct pt_regs *regs)
+{
+ unsigned int trapnr;
+ irqentry_state_t state;
+
+ regs->int_code = S390_lowcore.pgm_int_code;
+ regs->int_parm_long = S390_lowcore.trans_exc_code;
+
+ state = irqentry_enter(regs);
+
+ if (user_mode(regs)) {
+ update_timer_sys();
+ if (!static_branch_likely(&cpu_has_bear)) {
+ if (regs->last_break < 4096)
+ regs->last_break = 1;
+ }
+ current->thread.last_break = regs->last_break;
+ }
+
+ if (S390_lowcore.pgm_code & 0x0200) {
+ /* transaction abort */
+ current->thread.trap_tdb = S390_lowcore.pgm_tdb;
+ }
+
+ if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) {
+ if (user_mode(regs)) {
+ struct per_event *ev = &current->thread.per_event;
+
+ set_thread_flag(TIF_PER_TRAP);
+ ev->address = S390_lowcore.per_address;
+ ev->cause = S390_lowcore.per_code_combined;
+ ev->paid = S390_lowcore.per_access_id;
+ } else {
+ /* PER event in kernel is kprobes */
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+ do_per_trap(regs);
+ goto out;
+ }
+ }
+
+ if (!irqs_disabled_flags(regs->psw.mask))
+ trace_hardirqs_on();
+ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER);
+
+ trapnr = regs->int_code & PGM_INT_CODE_MASK;
+ if (trapnr)
+ pgm_check_table[trapnr](regs);
+out:
+ local_irq_disable();
+ irqentry_exit(regs, state);
+}
+
+/*
+ * The program check table contains exactly 128 (0x00-0x7f) entries. Each
+ * line defines the function to be called corresponding to the program check
+ * interruption code.
+ */
+static void (*pgm_check_table[128])(struct pt_regs *regs) = {
+ [0x00] = default_trap_handler,
+ [0x01] = illegal_op,
+ [0x02] = privileged_op,
+ [0x03] = execute_exception,
+ [0x04] = do_protection_exception,
+ [0x05] = addressing_exception,
+ [0x06] = specification_exception,
+ [0x07] = data_exception,
+ [0x08] = overflow_exception,
+ [0x09] = divide_exception,
+ [0x0a] = overflow_exception,
+ [0x0b] = divide_exception,
+ [0x0c] = hfp_overflow_exception,
+ [0x0d] = hfp_underflow_exception,
+ [0x0e] = hfp_significance_exception,
+ [0x0f] = hfp_divide_exception,
+ [0x10] = do_dat_exception,
+ [0x11] = do_dat_exception,
+ [0x12] = translation_specification_exception,
+ [0x13] = special_op_exception,
+ [0x14] = default_trap_handler,
+ [0x15] = operand_exception,
+ [0x16] = default_trap_handler,
+ [0x17] = default_trap_handler,
+ [0x18] = transaction_exception,
+ [0x19] = default_trap_handler,
+ [0x1a] = default_trap_handler,
+ [0x1b] = vector_exception,
+ [0x1c] = space_switch_exception,
+ [0x1d] = hfp_sqrt_exception,
+ [0x1e ... 0x37] = default_trap_handler,
+ [0x38] = do_dat_exception,
+ [0x39] = do_dat_exception,
+ [0x3a] = do_dat_exception,
+ [0x3b] = do_dat_exception,
+ [0x3c] = default_trap_handler,
+ [0x3d] = do_secure_storage_access,
+ [0x3e] = do_non_secure_storage_access,
+ [0x3f] = do_secure_storage_violation,
+ [0x40] = monitor_event_exception,
+ [0x41 ... 0x7f] = default_trap_handler,
+};
+
+#define COND_TRAP(x) asm( \
+ ".weak " __stringify(x) "\n\t" \
+ ".set " __stringify(x) "," \
+ __stringify(default_trap_handler))
+
+COND_TRAP(do_secure_storage_access);
+COND_TRAP(do_non_secure_storage_access);
+COND_TRAP(do_secure_storage_violation);
diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c
index da2d4d4c5b0e..0ece156fdd7c 100644
--- a/arch/s390/kernel/unwind_bc.c
+++ b/arch/s390/kernel/unwind_bc.c
@@ -36,10 +36,17 @@ static bool update_stack_info(struct unwind_state *state, unsigned long sp)
return true;
}
-static inline bool is_task_pt_regs(struct unwind_state *state,
- struct pt_regs *regs)
+static inline bool is_final_pt_regs(struct unwind_state *state,
+ struct pt_regs *regs)
{
- return task_pt_regs(state->task) == regs;
+ /* user mode or kernel thread pt_regs at the bottom of task stack */
+ if (task_pt_regs(state->task) == regs)
+ return true;
+
+ /* user mode pt_regs at the bottom of irq stack */
+ return state->stack_info.type == STACK_TYPE_IRQ &&
+ state->stack_info.end - sizeof(struct pt_regs) == (unsigned long)regs &&
+ READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE;
}
bool unwind_next_frame(struct unwind_state *state)
@@ -57,8 +64,8 @@ bool unwind_next_frame(struct unwind_state *state)
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
reliable = false;
regs = NULL;
- if (!__kernel_text_address(ip)) {
- /* skip bogus %r14 */
+ /* skip bogus %r14 or if is the same as regs->psw.addr */
+ if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) {
state->regs = NULL;
return unwind_next_frame(state);
}
@@ -80,7 +87,7 @@ bool unwind_next_frame(struct unwind_state *state)
if (!on_stack(info, sp, sizeof(struct pt_regs)))
goto out_err;
regs = (struct pt_regs *) sp;
- if (is_task_pt_regs(state, regs))
+ if (is_final_pt_regs(state, regs))
goto out_stop;
ip = READ_ONCE_NOCHECK(regs->psw.addr);
sp = READ_ONCE_NOCHECK(regs->gprs[15]);
@@ -96,13 +103,11 @@ bool unwind_next_frame(struct unwind_state *state)
if (sp & 0x7)
goto out_err;
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->regs = regs;
state->reliable = reliable;
+ state->ip = unwind_recover_ret_addr(state, ip);
return true;
out_err:
@@ -154,12 +159,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
ip = READ_ONCE_NOCHECK(sf->gprs[8]);
}
- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL);
-
/* Update unwind state */
state->sp = sp;
- state->ip = ip;
state->reliable = true;
+ state->ip = unwind_recover_ret_addr(state, ip);
if (!first_frame)
return;
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index 5007fac01bb5..b88345ef8bd9 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
return -EINVAL;
if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)
return -EINVAL;
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
auprobe->saved_per = psw_bits(regs->psw).per;
auprobe->saved_int_code = regs->int_code;
regs->int_code = UPROBE_TRAP_NR;
@@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
/* fix per address */
current->thread.per_event.address = utask->vaddr;
/* trigger per event */
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
return 0;
}
@@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
case DIE_SSTEP:
if (uprobe_post_sstep_notifier(regs))
return NOTIFY_STOP;
+ break;
default:
break;
}
@@ -176,9 +177,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -193,9 +192,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(ptr) __ptr = (ptr); \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)__ptr & mask) \
+ if ((u64 __force)__ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (put_user(*(input), __ptr)) \
__rc = EMU_ADDRESSING; \
@@ -212,9 +209,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len)
__typeof__(*(ptr)) input; \
int __rc = 0; \
\
- if (!test_facility(34)) \
- __rc = EMU_ILLEGAL_OP; \
- else if ((u64 __force)ptr & mask) \
+ if ((u64 __force)ptr & mask) \
__rc = EMU_SPECIFICATION; \
else if (get_user(input, ptr)) \
__rc = EMU_ADDRESSING; \
@@ -259,7 +254,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len)
return;
current->thread.per_event.address = regs->psw.addr;
current->thread.per_event.cause = PER_EVENT_STORE >> 16;
- set_pt_regs_flag(regs, PIF_PER_TRAP);
+ set_thread_flag(TIF_PER_TRAP);
}
/*
@@ -326,10 +321,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs)
break;
case 0xc6:
switch (insn->opc1) {
- case 0x02: /* pfdrl */
- if (!test_facility(34))
- rc = EMU_ILLEGAL_OP;
- break;
case 0x04: /* cghrl */
rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64);
break;
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
new file mode 100644
index 000000000000..fc07bc39e698
--- /dev/null
+++ b/arch/s390/kernel/uv.c
@@ -0,0 +1,717 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Ultravisor functions and initialization
+ *
+ * Copyright IBM Corp. 2019, 2020
+ */
+#define KMSG_COMPONENT "prot_virt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/bitmap.h>
+#include <linux/memblock.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/uv.h>
+
+/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+int __bootdata_preserved(prot_virt_guest);
+#endif
+
+/*
+ * uv_info contains both host and guest information but it's currently only
+ * expected to be used within modules if it's the KVM module or for
+ * any PV guest module.
+ *
+ * The kernel itself will write these values once in uv_query_info()
+ * and then make some of them readable via a sysfs interface.
+ */
+struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
+
+#if IS_ENABLED(CONFIG_KVM)
+int __bootdata_preserved(prot_virt_host);
+EXPORT_SYMBOL(prot_virt_host);
+
+static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
+{
+ struct uv_cb_init uvcb = {
+ .header.cmd = UVC_CMD_INIT_UV,
+ .header.len = sizeof(uvcb),
+ .stor_origin = stor_base,
+ .stor_len = stor_len,
+ };
+
+ if (uv_call(0, (uint64_t)&uvcb)) {
+ pr_err("Ultravisor init failed with rc: 0x%x rrc: 0%x\n",
+ uvcb.header.rc, uvcb.header.rrc);
+ return -1;
+ }
+ return 0;
+}
+
+void __init setup_uv(void)
+{
+ void *uv_stor_base;
+
+ if (!is_prot_virt_host())
+ return;
+
+ uv_stor_base = memblock_alloc_try_nid(
+ uv_info.uv_base_stor_len, SZ_1M, SZ_2G,
+ MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
+ if (!uv_stor_base) {
+ pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n",
+ uv_info.uv_base_stor_len);
+ goto fail;
+ }
+
+ if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) {
+ memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+ goto fail;
+ }
+
+ pr_info("Reserving %luMB as ultravisor base storage\n",
+ uv_info.uv_base_stor_len >> 20);
+ return;
+fail:
+ pr_info("Disabling support for protected virtualization");
+ prot_virt_host = 0;
+}
+
+/*
+ * Requests the Ultravisor to pin the page in the shared state. This will
+ * cause an intercept when the guest attempts to unshare the pinned page.
+ */
+int uv_pin_shared(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_PIN_PAGE_SHARED,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr,
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(uv_pin_shared);
+
+/*
+ * Requests the Ultravisor to destroy a guest page and make it
+ * accessible to the host. The destroy clears the page instead of
+ * exporting.
+ *
+ * @paddr: Absolute host address of page to be destroyed
+ */
+static int uv_destroy_page(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_DESTR_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb)) {
+ /*
+ * Older firmware uses 107/d as an indication of a non secure
+ * page. Let us emulate the newer variant (no-op).
+ */
+ if (uvcb.header.rc == 0x107 && uvcb.header.rrc == 0xd)
+ return 0;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * The caller must already hold a reference to the page
+ */
+int uv_destroy_owned_page(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_destroy_page(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Requests the Ultravisor to encrypt a guest page and make it
+ * accessible to the host for paging (export).
+ *
+ * @paddr: Absolute host address of page to be exported
+ */
+int uv_convert_from_secure(unsigned long paddr)
+{
+ struct uv_cb_cfs uvcb = {
+ .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .paddr = paddr
+ };
+
+ if (uv_call(0, (u64)&uvcb))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * The caller must already hold a reference to the page
+ */
+int uv_convert_owned_from_secure(unsigned long paddr)
+{
+ struct page *page = phys_to_page(paddr);
+ int rc;
+
+ get_page(page);
+ rc = uv_convert_from_secure(paddr);
+ if (!rc)
+ clear_bit(PG_arch_1, &page->flags);
+ put_page(page);
+ return rc;
+}
+
+/*
+ * Calculate the expected ref_count for a page that would otherwise have no
+ * further pins. This was cribbed from similar functions in other places in
+ * the kernel, but with some slight modifications. We know that a secure
+ * page can not be a huge page for example.
+ */
+static int expected_page_refs(struct page *page)
+{
+ int res;
+
+ res = page_mapcount(page);
+ if (PageSwapCache(page)) {
+ res++;
+ } else if (page_mapping(page)) {
+ res++;
+ if (page_has_private(page))
+ res++;
+ }
+ return res;
+}
+
+static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
+{
+ int expected, cc = 0;
+
+ if (PageWriteback(page))
+ return -EAGAIN;
+ expected = expected_page_refs(page);
+ if (!page_ref_freeze(page, expected))
+ return -EBUSY;
+ set_bit(PG_arch_1, &page->flags);
+ /*
+ * If the UVC does not succeed or fail immediately, we don't want to
+ * loop for long, or we might get stall notifications.
+ * On the other hand, this is a complex scenario and we are holding a lot of
+ * locks, so we can't easily sleep and reschedule. We try only once,
+ * and if the UVC returned busy or partial completion, we return
+ * -EAGAIN and we let the callers deal with it.
+ */
+ cc = __uv_call(0, (u64)uvcb);
+ page_ref_unfreeze(page, expected);
+ /*
+ * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+ * If busy or partially completed, return -EAGAIN.
+ */
+ if (cc == UVC_CC_OK)
+ return 0;
+ else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL)
+ return -EAGAIN;
+ return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
+}
+
+/**
+ * should_export_before_import - Determine whether an export is needed
+ * before an import-like operation
+ * @uvcb: the Ultravisor control block of the UVC to be performed
+ * @mm: the mm of the process
+ *
+ * Returns whether an export is needed before every import-like operation.
+ * This is needed for shared pages, which don't trigger a secure storage
+ * exception when accessed from a different guest.
+ *
+ * Although considered as one, the Unpin Page UVC is not an actual import,
+ * so it is not affected.
+ *
+ * No export is needed also when there is only one protected VM, because the
+ * page cannot belong to the wrong VM in that case (there is no "other VM"
+ * it can belong to).
+ *
+ * Return: true if an export is needed before every import, otherwise false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+ /*
+ * The misc feature indicates, among other things, that importing a
+ * shared page from a different protected VM will automatically also
+ * transfer its ownership.
+ */
+ if (uv_has_feature(BIT_UV_FEAT_MISC))
+ return false;
+ if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
+ return false;
+ return atomic_read(&mm->context.protected_count) > 1;
+}
+
+/*
+ * Requests the Ultravisor to make a page accessible to a guest.
+ * If it's brought in the first time, it will be cleared. If
+ * it has been exported before, it will be decrypted and integrity
+ * checked.
+ */
+int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
+{
+ struct vm_area_struct *vma;
+ bool local_drain = false;
+ spinlock_t *ptelock;
+ unsigned long uaddr;
+ struct page *page;
+ pte_t *ptep;
+ int rc;
+
+again:
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Secure pages cannot be huge and userspace should not combine both.
+ * In case userspace does it anyway this will result in an -EFAULT for
+ * the unpack. The guest is thus never reaching secure mode. If
+ * userspace is playing dirty tricky with mapping huge pages later
+ * on this will result in a segmentation fault.
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ rc = -ENXIO;
+ ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
+ if (!ptep)
+ goto out;
+ if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
+ page = pte_page(*ptep);
+ rc = -EAGAIN;
+ if (trylock_page(page)) {
+ if (should_export_before_import(uvcb, gmap->mm))
+ uv_convert_from_secure(page_to_phys(page));
+ rc = make_page_secure(page, uvcb);
+ unlock_page(page);
+ }
+ }
+ pte_unmap_unlock(ptep, ptelock);
+out:
+ mmap_read_unlock(gmap->mm);
+
+ if (rc == -EAGAIN) {
+ /*
+ * If we are here because the UVC returned busy or partial
+ * completion, this is just a useless check, but it is safe.
+ */
+ wait_on_page_writeback(page);
+ } else if (rc == -EBUSY) {
+ /*
+ * If we have tried a local drain and the page refcount
+ * still does not match our expected safe value, try with a
+ * system wide drain. This is needed if the pagevecs holding
+ * the page are on a different CPU.
+ */
+ if (local_drain) {
+ lru_add_drain_all();
+ /* We give up here, and let the caller try again */
+ return -EAGAIN;
+ }
+ /*
+ * We are here if the page refcount does not match the
+ * expected safe value. The main culprits are usually
+ * pagevecs. With lru_add_drain() we drain the pagevecs
+ * on the local CPU so that hopefully the refcount will
+ * reach the expected safe value.
+ */
+ lru_add_drain();
+ local_drain = true;
+ /* And now we try again immediately after draining */
+ goto again;
+ } else if (rc == -ENXIO) {
+ if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE))
+ return -EFAULT;
+ return -EAGAIN;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_make_secure);
+
+int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = gmap->guest_handle,
+ .gaddr = gaddr,
+ };
+
+ return gmap_make_secure(gmap, gaddr, &uvcb);
+}
+EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
+
+/**
+ * gmap_destroy_page - Destroy a guest page.
+ * @gmap: the gmap of the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ */
+int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
+{
+ struct vm_area_struct *vma;
+ unsigned long uaddr;
+ struct page *page;
+ int rc;
+
+ rc = -EFAULT;
+ mmap_read_lock(gmap->mm);
+
+ uaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(uaddr))
+ goto out;
+ vma = vma_lookup(gmap->mm, uaddr);
+ if (!vma)
+ goto out;
+ /*
+ * Huge pages should not be able to become secure
+ */
+ if (is_vm_hugetlb_page(vma))
+ goto out;
+
+ rc = 0;
+ /* we take an extra reference here */
+ page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page))
+ goto out;
+ rc = uv_destroy_owned_page(page_to_phys(page));
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_owned_from_secure(page_to_phys(page));
+ put_page(page);
+out:
+ mmap_read_unlock(gmap->mm);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_destroy_page);
+
+/*
+ * To be called with the page locked or with an extra reference! This will
+ * prevent gmap_make_secure from touching the page concurrently. Having 2
+ * parallel make_page_accessible is fine, as the UV calls will become a
+ * no-op if the page is already exported.
+ */
+int arch_make_page_accessible(struct page *page)
+{
+ int rc = 0;
+
+ /* Hugepage cannot be protected, so nothing to do */
+ if (PageHuge(page))
+ return 0;
+
+ /*
+ * PG_arch_1 is used in 3 places:
+ * 1. for kernel page tables during early boot
+ * 2. for storage keys of huge pages and KVM
+ * 3. As an indication that this page might be secure. This can
+ * overindicate, e.g. we set the bit before calling
+ * convert_to_secure.
+ * As secure pages are never huge, all 3 variants can co-exists.
+ */
+ if (!test_bit(PG_arch_1, &page->flags))
+ return 0;
+
+ rc = uv_pin_shared(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ rc = uv_convert_from_secure(page_to_phys(page));
+ if (!rc) {
+ clear_bit(PG_arch_1, &page->flags);
+ return 0;
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(arch_make_page_accessible);
+
+#endif
+
+#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
+static ssize_t uv_query_facilities(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n",
+ uv_info.inst_calls_list[0],
+ uv_info.inst_calls_list[1],
+ uv_info.inst_calls_list[2],
+ uv_info.inst_calls_list[3]);
+}
+
+static struct kobj_attribute uv_query_facilities_attr =
+ __ATTR(facilities, 0444, uv_query_facilities, NULL);
+
+static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_ver_attr =
+ __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
+ __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
+
+static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len);
+}
+
+static struct kobj_attribute uv_query_dump_cpu_len_attr =
+ __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
+
+static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len);
+}
+
+static struct kobj_attribute uv_query_dump_storage_state_len_attr =
+ __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
+
+static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len);
+}
+
+static struct kobj_attribute uv_query_dump_finalize_len_attr =
+ __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL);
+
+static ssize_t uv_query_feature_indications(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications);
+}
+
+static struct kobj_attribute uv_query_feature_indications_attr =
+ __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
+
+static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1);
+}
+
+static struct kobj_attribute uv_query_max_guest_cpus_attr =
+ __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL);
+
+static ssize_t uv_query_max_guest_vms(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf);
+}
+
+static struct kobj_attribute uv_query_max_guest_vms_attr =
+ __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL);
+
+static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr);
+}
+
+static struct kobj_attribute uv_query_max_guest_addr_attr =
+ __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
+
+static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver);
+}
+
+static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
+ __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
+
+static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags);
+}
+
+static struct kobj_attribute uv_query_supp_att_pflags_attr =
+ __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
+
+static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr =
+ __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL);
+
+static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_pcf_attr =
+ __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL);
+
+static ssize_t uv_query_supp_secret_types(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types);
+}
+
+static struct kobj_attribute uv_query_supp_secret_types_attr =
+ __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL);
+
+static ssize_t uv_query_max_secrets(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_secrets);
+}
+
+static struct kobj_attribute uv_query_max_secrets_attr =
+ __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL);
+
+static struct attribute *uv_query_attrs[] = {
+ &uv_query_facilities_attr.attr,
+ &uv_query_feature_indications_attr.attr,
+ &uv_query_max_guest_cpus_attr.attr,
+ &uv_query_max_guest_vms_attr.attr,
+ &uv_query_max_guest_addr_attr.attr,
+ &uv_query_supp_se_hdr_ver_attr.attr,
+ &uv_query_supp_se_hdr_pcf_attr.attr,
+ &uv_query_dump_storage_state_len_attr.attr,
+ &uv_query_dump_finalize_len_attr.attr,
+ &uv_query_dump_cpu_len_attr.attr,
+ &uv_query_supp_att_req_hdr_ver_attr.attr,
+ &uv_query_supp_att_pflags_attr.attr,
+ &uv_query_supp_add_secret_req_ver_attr.attr,
+ &uv_query_supp_add_secret_pcf_attr.attr,
+ &uv_query_supp_secret_types_attr.attr,
+ &uv_query_max_secrets_attr.attr,
+ NULL,
+};
+
+static struct attribute_group uv_query_attr_group = {
+ .attrs = uv_query_attrs,
+};
+
+static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int val = 0;
+
+#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
+ val = prot_virt_guest;
+#endif
+ return sysfs_emit(buf, "%d\n", val);
+}
+
+static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int val = 0;
+
+#if IS_ENABLED(CONFIG_KVM)
+ val = prot_virt_host;
+#endif
+
+ return sysfs_emit(buf, "%d\n", val);
+}
+
+static struct kobj_attribute uv_prot_virt_guest =
+ __ATTR(prot_virt_guest, 0444, uv_is_prot_virt_guest, NULL);
+
+static struct kobj_attribute uv_prot_virt_host =
+ __ATTR(prot_virt_host, 0444, uv_is_prot_virt_host, NULL);
+
+static const struct attribute *uv_prot_virt_attrs[] = {
+ &uv_prot_virt_guest.attr,
+ &uv_prot_virt_host.attr,
+ NULL,
+};
+
+static struct kset *uv_query_kset;
+static struct kobject *uv_kobj;
+
+static int __init uv_info_init(void)
+{
+ int rc = -ENOMEM;
+
+ if (!test_facility(158))
+ return 0;
+
+ uv_kobj = kobject_create_and_add("uv", firmware_kobj);
+ if (!uv_kobj)
+ return -ENOMEM;
+
+ rc = sysfs_create_files(uv_kobj, uv_prot_virt_attrs);
+ if (rc)
+ goto out_kobj;
+
+ uv_query_kset = kset_create_and_add("query", NULL, uv_kobj);
+ if (!uv_query_kset) {
+ rc = -ENOMEM;
+ goto out_ind_files;
+ }
+
+ rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group);
+ if (!rc)
+ return 0;
+
+ kset_unregister(uv_query_kset);
+out_ind_files:
+ sysfs_remove_files(uv_kobj, uv_prot_virt_attrs);
+out_kobj:
+ kobject_del(uv_kobj);
+ kobject_put(uv_kobj);
+ return rc;
+}
+device_initcall(uv_info_init);
+#endif
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index bcc9bdb39ba2..bbaefd84f15e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -6,269 +6,253 @@
* Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
*/
-#include <linux/init.h>
+#include <linux/binfmts.h>
+#include <linux/compat.h>
+#include <linux/elf.h>
#include <linux/errno.h>
-#include <linux/sched.h>
+#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
#include <linux/slab.h>
-#include <linux/user.h>
-#include <linux/elf.h>
-#include <linux/security.h>
-#include <linux/memblock.h>
-#include <linux/compat.h>
-#include <asm/asm-offsets.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/sections.h>
+#include <linux/smp.h>
+#include <linux/time_namespace.h>
+#include <linux/random.h>
+#include <vdso/datapage.h>
#include <asm/vdso.h>
-#include <asm/facility.h>
-extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
-static struct page **vdso64_pagelist;
+extern char vdso64_start[], vdso64_end[];
+extern char vdso32_start[], vdso32_end[];
-/*
- * Should the kernel map a VDSO page into processes and pass its
- * address down to glibc upon exec()?
- */
-unsigned int __read_mostly vdso_enabled = 1;
-
-static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct page **vdso_pagelist;
- unsigned long vdso_pages;
-
- vdso_pagelist = vdso64_pagelist;
- vdso_pages = vdso64_pages;
-
- if (vmf->pgoff >= vdso_pages)
- return VM_FAULT_SIGBUS;
+static struct vm_special_mapping vvar_mapping;
- vmf->page = vdso_pagelist[vmf->pgoff];
- get_page(vmf->page);
- return 0;
-}
-
-static int vdso_mremap(const struct vm_special_mapping *sm,
- struct vm_area_struct *vma)
-{
- unsigned long vdso_pages;
-
- vdso_pages = vdso64_pages;
-
- if ((vdso_pages << PAGE_SHIFT) != vma->vm_end - vma->vm_start)
- return -EINVAL;
+static union {
+ struct vdso_data data[CS_BASES];
+ u8 page[PAGE_SIZE];
+} vdso_data_store __page_aligned_data;
- if (WARN_ON_ONCE(current->mm != vma->vm_mm))
- return -EFAULT;
+struct vdso_data *vdso_data = vdso_data_store.data;
- current->mm->context.vdso_base = vma->vm_start;
- return 0;
-}
-
-static const struct vm_special_mapping vdso_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
- .mremap = vdso_mremap,
+enum vvar_pages {
+ VVAR_DATA_PAGE_OFFSET,
+ VVAR_TIMENS_PAGE_OFFSET,
+ VVAR_NR_PAGES,
};
-static int __init vdso_setup(char *str)
+#ifdef CONFIG_TIME_NS
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
- bool enabled;
-
- if (!kstrtobool(str, &enabled))
- vdso_enabled = enabled;
- return 1;
+ return (struct vdso_data *)(vvar_page);
}
-__setup("vdso=", vdso_setup);
-
-/*
- * The vdso data page
- */
-static union {
- struct vdso_data data;
- u8 page[PAGE_SIZE];
-} vdso_data_store __page_aligned_data;
-struct vdso_data *vdso_data = &vdso_data_store.data;
/*
- * Setup vdso data page.
+ * The VVAR page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
*/
-static void __init vdso_init_data(struct vdso_data *vd)
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
- vd->ectg_available = test_facility(31);
-}
-
-/*
- * Allocate/free per cpu vdso data.
- */
-#define SEGMENT_ORDER 2
+ struct mm_struct *mm = task->mm;
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma;
-/*
- * The initial vdso_data structure for the boot CPU. Eventually
- * it is replaced with a properly allocated structure in vdso_init.
- * This is necessary because a valid S390_lowcore.vdso_per_cpu_data
- * pointer is required to be able to return from an interrupt or
- * program check. See the exit paths in entry.S.
- */
-struct vdso_data boot_vdso_data __initdata;
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (!vma_is_special_mapping(vma, &vvar_mapping))
+ continue;
+ zap_vma_pages(vma);
+ break;
+ }
+ mmap_read_unlock(mm);
+ return 0;
+}
+#endif
-void __init vdso_alloc_boot_cpu(struct lowcore *lowcore)
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
- lowcore->vdso_per_cpu_data = (unsigned long) &boot_vdso_data;
+ struct page *timens_page = find_timens_vvar_page(vma);
+ unsigned long addr, pfn;
+ vm_fault_t err;
+
+ switch (vmf->pgoff) {
+ case VVAR_DATA_PAGE_OFFSET:
+ pfn = virt_to_pfn(vdso_data);
+ if (timens_page) {
+ /*
+ * Fault in VVAR page too, since it will be accessed
+ * to get clock data anyway.
+ */
+ addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE;
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+ pfn = page_to_pfn(timens_page);
+ }
+ break;
+#ifdef CONFIG_TIME_NS
+ case VVAR_TIMENS_PAGE_OFFSET:
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
+ * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+ pfn = virt_to_pfn(vdso_data);
+ break;
+#endif /* CONFIG_TIME_NS */
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+ return vmf_insert_pfn(vma, vmf->address, pfn);
}
-int vdso_alloc_per_cpu(struct lowcore *lowcore)
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma)
{
- unsigned long segment_table, page_table, page_frame;
- struct vdso_per_cpu_data *vd;
-
- segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
- page_table = get_zeroed_page(GFP_KERNEL);
- page_frame = get_zeroed_page(GFP_KERNEL);
- if (!segment_table || !page_table || !page_frame)
- goto out;
- arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER);
- arch_set_page_dat(virt_to_page(page_table), 0);
-
- /* Initialize per-cpu vdso data page */
- vd = (struct vdso_per_cpu_data *) page_frame;
- vd->cpu_nr = lowcore->cpu_nr;
- vd->node_id = cpu_to_node(vd->cpu_nr);
-
- /* Set up page table for the vdso address space */
- memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES);
- memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE);
-
- *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
- *(unsigned long *) page_table = _PAGE_PROTECT + page_frame;
-
- lowcore->vdso_asce = segment_table +
- _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
- lowcore->vdso_per_cpu_data = page_frame;
-
+ current->mm->context.vdso_base = vma->vm_start;
return 0;
-
-out:
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
- return -ENOMEM;
}
-void vdso_free_per_cpu(struct lowcore *lowcore)
-{
- unsigned long segment_table, page_table, page_frame;
+static struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
- segment_table = lowcore->vdso_asce & PAGE_MASK;
- page_table = *(unsigned long *) segment_table;
- page_frame = *(unsigned long *) page_table;
+static struct vm_special_mapping vdso64_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
+
+static struct vm_special_mapping vdso32_mapping = {
+ .name = "[vdso]",
+ .mremap = vdso_mremap,
+};
- free_page(page_frame);
- free_page(page_table);
- free_pages(segment_table, SEGMENT_ORDER);
+int vdso_getcpu_init(void)
+{
+ set_tod_programmable_field(smp_processor_id());
+ return 0;
}
+early_initcall(vdso_getcpu_init); /* Must be called before SMP init */
-/*
- * This is called from binfmt_elf, we create the special vma for the
- * vDSO and insert it into the mm struct tree
- */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len)
{
+ unsigned long vvar_start, vdso_text_start, vdso_text_len;
+ struct vm_special_mapping *vdso_mapping;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long vdso_pages;
- unsigned long vdso_base;
int rc;
- if (!vdso_enabled)
- return 0;
-
- if (is_compat_task())
- return 0;
-
- vdso_pages = vdso64_pages;
- /*
- * vDSO has a problem and was disabled, just don't "enable" it for
- * the process
- */
- if (vdso_pages == 0)
- return 0;
-
- /*
- * pick a base address for the vDSO in process space. We try to put
- * it at vdso_base which is the "natural" base for it, but we might
- * fail and end up putting it elsewhere.
- */
- if (down_write_killable(&mm->mmap_sem))
+ BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0);
- if (IS_ERR_VALUE(vdso_base)) {
- rc = vdso_base;
- goto out_up;
- }
- /*
- * our vma flags don't have VM_WRITE so by default, the process
- * isn't allowed to write those pages.
- * gdb can break that with ptrace interface, and thus trigger COW
- * on those pages but it's then your responsibility to never do that
- * on the "data" page of the vDSO or you'll stop getting kernel
- * updates and your nice userland gettimeofday will be totally dead.
- * It's fine to use that for setting breakpoints in the vDSO code
- * pages though.
- */
- vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+ if (is_compat_task()) {
+ vdso_text_len = vdso32_end - vdso32_start;
+ vdso_mapping = &vdso32_mapping;
+ } else {
+ vdso_text_len = vdso64_end - vdso64_start;
+ vdso_mapping = &vdso64_mapping;
+ }
+ vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0);
+ rc = vvar_start;
+ if (IS_ERR_VALUE(vvar_start))
+ goto out;
+ vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+ rc = PTR_ERR(vma);
+ if (IS_ERR(vma))
+ goto out;
+ vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE;
+ /* VM_MAYWRITE for COW so gdb can set breakpoints */
+ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- &vdso_mapping);
+ vdso_mapping);
if (IS_ERR(vma)) {
+ do_munmap(mm, vvar_start, PAGE_SIZE, NULL);
rc = PTR_ERR(vma);
- goto out_up;
+ } else {
+ current->mm->context.vdso_base = vdso_text_start;
+ rc = 0;
}
+out:
+ mmap_write_unlock(mm);
+ return rc;
+}
- current->mm->context.vdso_base = vdso_base;
- rc = 0;
+static unsigned long vdso_addr(unsigned long start, unsigned long len)
+{
+ unsigned long addr, end, offset;
-out_up:
- up_write(&mm->mmap_sem);
- return rc;
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= VDSO_BASE)
+ end = VDSO_BASE;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+ return addr;
}
-static int __init vdso_init(void)
+unsigned long vdso_size(void)
{
- int i;
+ unsigned long size = VVAR_NR_PAGES * PAGE_SIZE;
- vdso_init_data(vdso_data);
+ if (is_compat_task())
+ size += vdso32_end - vdso32_start;
+ else
+ size += vdso64_end - vdso64_start;
+ return PAGE_ALIGN(size);
+}
- /* Calculate the size of the 64 bit vDSO */
- vdso64_pages = ((&vdso64_end - &vdso64_start
- + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ unsigned long addr = VDSO_BASE;
+ unsigned long size = vdso_size();
- /* Make sure pages are in the correct state */
- vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- BUG_ON(vdso64_pagelist == NULL);
- for (i = 0; i < vdso64_pages - 1; i++) {
- struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- get_page(pg);
- vdso64_pagelist[i] = pg;
- }
- vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
- vdso64_pagelist[vdso64_pages] = NULL;
- if (vdso_alloc_per_cpu(&S390_lowcore))
- BUG();
+ if (current->flags & PF_RANDOMIZE)
+ addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size);
+ return map_vdso(addr, size);
+}
- get_page(virt_to_page(vdso_data));
+static struct page ** __init vdso_setup_pages(void *start, void *end)
+{
+ int pages = (end - start) >> PAGE_SHIFT;
+ struct page **pagelist;
+ int i;
+ pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (!pagelist)
+ panic("%s: Cannot allocate page list for VDSO", __func__);
+ for (i = 0; i < pages; i++)
+ pagelist[i] = virt_to_page(start + i * PAGE_SIZE);
+ return pagelist;
+}
+
+static int __init vdso_init(void)
+{
+ vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end);
+ if (IS_ENABLED(CONFIG_COMPAT))
+ vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end);
return 0;
}
-early_initcall(vdso_init);
+arch_initcall(vdso_init);
diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore
new file mode 100644
index 000000000000..5167384843b9
--- /dev/null
+++ b/arch/s390/kernel/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
new file mode 100644
index 000000000000..caec7db6f966
--- /dev/null
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+# List of files in the vdso
+
+KCOV_INSTRUMENT := n
+
+# Include the generic Makefile to check the built vdso.
+include $(srctree)/lib/vdso/Makefile
+obj-vdso32 = vdso_user_wrapper-32.o note-32.o
+
+# Build rules
+
+targets := $(obj-vdso32) vdso32.so vdso32.so.dbg
+obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32))
+
+KBUILD_AFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
+KBUILD_AFLAGS_32 += -m31 -s
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
+
+LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \
+ --hash-style=both --build-id=sha1 -melf_s390 -T
+
+$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+
+obj-y += vdso32_wrapper.o
+targets += vdso32.lds
+CPPFLAGS_vdso32.lds += -P -C -U$(ARCH)
+
+# Disable gcov profiling, ubsan and kasan for VDSO code
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+
+# Force dependency (incbin is bad)
+$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
+
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check)
+
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
+ $(call if_changed,vdso_and_check)
+
+# strip rule for the .so file
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj-vdso32): %-32.o: %.S FORCE
+ $(call if_changed_dep,vdso32as)
+
+# actual build commands
+quiet_cmd_vdso32as = VDSO32A $@
+ cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso32cc = VDSO32C $@
+ cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $<
+
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
+
+include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..9c4f951e227d
--- /dev/null
+++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like VDSO_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S
new file mode 100644
index 000000000000..db19d0680a0a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/note.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
new file mode 100644
index 000000000000..edf5ff1debe1
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This is the infamous ld script for the 64 bits vdso
+ * library
+ */
+
+#include <asm/page.h>
+#include <asm/vdso.h>
+
+OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
+OUTPUT_ARCH(s390:31-bit)
+ENTRY(_start)
+
+SECTIONS
+{
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+
+ . = ALIGN(16);
+ .text : {
+ *(.text .stub .text.* .gnu.linkonce.t.*)
+ } :text
+ PROVIDE(__etext = .);
+ PROVIDE(_etext = .);
+ PROVIDE(etext = .);
+
+ /*
+ * Other stuff is appended to the text segment:
+ */
+ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
+ .rodata1 : { *(.rodata1) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) }
+
+ .rela.dyn ALIGN(8) : { *(.rela.dyn) }
+ .got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
+
+ _end = .;
+ PROVIDE(end = .);
+
+ /*
+ * Stabs debugging sections are here too.
+ */
+ .stab 0 : { *(.stab) }
+ .stabstr 0 : { *(.stabstr) }
+ .stab.excl 0 : { *(.stab.excl) }
+ .stab.exclstr 0 : { *(.stab.exclstr) }
+ .stab.index 0 : { *(.stab.index) }
+ .stab.indexstr 0 : { *(.stab.indexstr) }
+ .comment 0 : { *(.comment) }
+
+ /*
+ * DWARF debug sections.
+ * Symbols in the DWARF debugging sections are relative to the
+ * beginning of the section so we begin them at 0.
+ */
+ /* DWARF 1 */
+ .debug 0 : { *(.debug) }
+ .line 0 : { *(.line) }
+ /* GNU DWARF 1 extensions */
+ .debug_srcinfo 0 : { *(.debug_srcinfo) }
+ .debug_sfnames 0 : { *(.debug_sfnames) }
+ /* DWARF 1.1 and DWARF 2 */
+ .debug_aranges 0 : { *(.debug_aranges) }
+ .debug_pubnames 0 : { *(.debug_pubnames) }
+ /* DWARF 2 */
+ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
+ .debug_abbrev 0 : { *(.debug_abbrev) }
+ .debug_line 0 : { *(.debug_line) }
+ .debug_frame 0 : { *(.debug_frame) }
+ .debug_str 0 : { *(.debug_str) }
+ .debug_loc 0 : { *(.debug_loc) }
+ .debug_macinfo 0 : { *(.debug_macinfo) }
+ /* SGI/MIPS DWARF 2 extensions */
+ .debug_weaknames 0 : { *(.debug_weaknames) }
+ .debug_funcnames 0 : { *(.debug_funcnames) }
+ .debug_typenames 0 : { *(.debug_typenames) }
+ .debug_varnames 0 : { *(.debug_varnames) }
+ /* DWARF 3 */
+ .debug_pubtypes 0 : { *(.debug_pubtypes) }
+ .debug_ranges 0 : { *(.debug_ranges) }
+ .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
+
+ /DISCARD/ : {
+ *(.note.GNU-stack)
+ *(.branch_lt)
+ *(.data .data.* .gnu.linkonce.d.* .sdata*)
+ *(.bss .sbss .dynbss .dynsbss)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ VDSO_VERSION_STRING {
+ global:
+ /*
+ * Has to be there for the kernel to find
+ */
+ __kernel_compat_restart_syscall;
+ __kernel_compat_rt_sigreturn;
+ __kernel_compat_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S
new file mode 100644
index 000000000000..de2fb930471a
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+ __PAGE_ALIGNED_DATA
+
+ .globl vdso32_start, vdso32_end
+ .balign PAGE_SIZE
+vdso32_start:
+ .incbin "arch/s390/kernel/vdso32/vdso32.so"
+ .balign PAGE_SIZE
+vdso32_end:
+
+ .previous
diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
new file mode 100644
index 000000000000..2e645003fdaf
--- /dev/null
+++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/unistd.h>
+#include <asm/dwarf.h>
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_compat_\func
+ .type __kernel_compat_\func,@function
+ __ALIGN
+__kernel_compat_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_compat_\func,.-__kernel_compat_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore
index 3fd18cf9fec2..4ec80685fecc 100644
--- a/arch/s390/kernel/vdso64/.gitignore
+++ b/arch/s390/kernel/vdso64/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
vdso64.lds
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index bec19e7e6e1c..e3c9085f8fa7 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -1,44 +1,56 @@
# SPDX-License-Identifier: GPL-2.0
-# List of files in the vdso, has to be asm only for now
+# List of files in the vdso
KCOV_INSTRUMENT := n
-obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o
+# Include the generic Makefile to check the built vdso.
+include $(srctree)/lib/vdso/Makefile
+obj-vdso64 = vdso_user_wrapper.o note.o
+obj-cvdso64 = vdso64_generic.o getcpu.o
+VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK)
+CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE)
+CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE)
# Build rules
-targets := $(obj-vdso64) vdso64.so vdso64.so.dbg
+targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg
obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64))
+obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64))
KBUILD_AFLAGS += -DBUILD_VDSO
-KBUILD_CFLAGS += -DBUILD_VDSO
+KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS))
-KBUILD_AFLAGS_64 += -m64 -s
+KBUILD_AFLAGS_64 += -m64
KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
-KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin
-KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
- -Wl,--hash-style=both
+KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64))
+KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin
+ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \
+ --hash-style=both --build-id=sha1 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)
obj-y += vdso64_wrapper.o
-extra-y += vdso64.lds
+targets += vdso64.lds
CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
# Disable gcov profiling, ubsan and kasan for VDSO code
GCOV_PROFILE := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
# Force dependency (incbin is bad)
$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check)
+
# link rule for the .so file, .lds has to be first
-$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
- $(call if_changed,vdso64ld)
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE
+ $(call if_changed,vdso_and_check)
# strip rule for the .so file
$(obj)/%.so: OBJCOPYFLAGS := -S
@@ -49,18 +61,19 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(obj-vdso64): %.o: %.S FORCE
$(call if_changed_dep,vdso64as)
+$(obj-cvdso64): %.o: %.c FORCE
+ $(call if_changed_dep,vdso64cc)
+
# actual build commands
-quiet_cmd_vdso64ld = VDSO64L $@
- cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $(filter %.lds %.o,$^) -o $@
quiet_cmd_vdso64as = VDSO64A $@
cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
+quiet_cmd_vdso64cc = VDSO64C $@
+ cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $<
-# install commands for the unstripped file
-quiet_cmd_vdso_install = INSTALL $@
- cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
-
-vdso64.so: $(obj)/vdso64.so.dbg
- @mkdir -p $(MODLIB)/vdso
- $(call cmd,vdso_install)
+# Generate VDSO offsets using helper script
+gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh
+quiet_cmd_vdsosym = VDSOSYM $@
+ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@
-vdso_install: vdso64.so
+include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
+ $(call if_changed,vdsosym)
diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S
deleted file mode 100644
index 081435398e0a..000000000000
--- a/arch/s390/kernel/vdso64/clock_getres.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_getres() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_clock_getres
- .type __kernel_clock_getres,@function
-__kernel_clock_getres:
- CFI_STARTPROC
- larl %r1,4f
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 0f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 0f
- larl %r1,3f
- cghi %r2,__CLOCK_REALTIME
- je 0f
- cghi %r2,__CLOCK_MONOTONIC
- je 0f
- cghi %r2,__CLOCK_THREAD_CPUTIME_ID
- je 0f
- cghi %r2,-2 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- jne 2f
- larl %r5,_vdso_data
- icm %r0,15,__LC_ECTG_OK(%r5)
- jz 2f
-0: ltgr %r3,%r3
- jz 1f /* res == NULL */
- lg %r0,0(%r1)
- xc 0(8,%r3),0(%r3) /* set tp->tv_sec to zero */
- stg %r0,8(%r3) /* store tp->tv_usec */
-1: lghi %r2,0
- br %r14
-2: lghi %r1,__NR_clock_getres /* fallback to svc */
- svc 0
- br %r14
- CFI_ENDPROC
-3: .quad __CLOCK_REALTIME_RES
-4: .quad __CLOCK_COARSE_RES
- .size __kernel_clock_getres,.-__kernel_clock_getres
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
deleted file mode 100644
index 9d2ee79b90f2..000000000000
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ /dev/null
@@ -1,163 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of clock_gettime() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_clock_gettime
- .type __kernel_clock_gettime,@function
-__kernel_clock_gettime:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
- cghi %r2,__CLOCK_REALTIME_COARSE
- je 4f
- cghi %r2,__CLOCK_REALTIME
- je 5f
- cghi %r2,-3 /* Per-thread CPUCLOCK with PID=0, VIRT=1 */
- je 9f
- cghi %r2,__CLOCK_MONOTONIC_COARSE
- je 3f
- cghi %r2,__CLOCK_MONOTONIC
- jne 12f
-
- /* CLOCK_MONOTONIC */
-0: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- lg %r0,__VDSO_WTOM_SEC(%r5)
- lg %r1,1(%r15)
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_WTOM_NSEC(%r5)
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- larl %r5,13f
-1: clg %r1,0(%r5)
- jl 2f
- slg %r1,0(%r5)
- aghi %r0,1
- j 1b
-2: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CLOCK_MONOTONIC_COARSE */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-3: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 3b
- lg %r0,__VDSO_WTOM_CRS_SEC(%r5)
- lg %r1,__VDSO_WTOM_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 3b
- j 2b
-
- /* CLOCK_REALTIME_COARSE */
-4: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 4b
- lg %r0,__VDSO_XTIME_CRS_SEC(%r5)
- lg %r1,__VDSO_XTIME_CRS_NSEC(%r5)
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 4b
- j 7f
-
- /* CLOCK_REALTIME */
-5: lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 5b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 17f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 18f
- lcgr %r0,%r0 /* negative TOD offset */
-18: algr %r1,%r0 /* add steering offset */
-17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- srlg %r1,%r1,0(%r2) /* >> tk->shift */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 5b
- larl %r5,13f
-6: clg %r1,0(%r5)
- jl 7f
- slg %r1,0(%r5)
- aghi %r0,1
- j 6b
-7: stg %r0,0(%r3) /* store tp->tv_sec */
- stg %r1,8(%r3) /* store tp->tv_nsec */
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* CPUCLOCK_VIRT for this thread */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-9: lghi %r4,0
- icm %r0,15,__VDSO_ECTG_OK(%r5)
- jz 12f
- sacf 256 /* Magic ectg instruction */
- .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
- sacf 0
- algr %r1,%r0 /* r1 = cputime as TOD value */
- mghi %r1,1000 /* convert to nanoseconds */
- srlg %r1,%r1,12 /* r1 = cputime in nanosec */
- lgr %r4,%r1
- larl %r5,13f
- srlg %r1,%r1,9 /* divide by 1000000000 */
- mlg %r0,8(%r5)
- srlg %r0,%r0,11 /* r0 = tv_sec */
- stg %r0,0(%r3)
- msg %r0,0(%r5) /* calculate tv_nsec */
- slgr %r4,%r0 /* r4 = tv_nsec */
- stg %r4,8(%r3)
- lghi %r2,0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
-
- /* Fallback to system call */
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD+16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
-12: lghi %r1,__NR_clock_gettime
- svc 0
- aghi %r15,16
- CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-
-13: .quad 1000000000
-14: .quad 19342813113834067
- .size __kernel_clock_gettime,.-__kernel_clock_gettime
diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
new file mode 100755
index 000000000000..37f05cb38dad
--- /dev/null
+++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+#
+# Match symbols in the DSO that look like VDSO_*; produce a header file
+# of constant offsets into the shared object.
+#
+# Doing this inside the Makefile will break the $(filter-out) function,
+# causing Kbuild to rebuild the vdso-offsets header file every time.
+#
+# Inspired by arm64 version.
+#
+
+LC_ALL=C
+sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p'
diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S
deleted file mode 100644
index 3c04f7328500..000000000000
--- a/arch/s390/kernel/vdso64/getcpu.S
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of getcpu() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2016
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/dwarf.h>
-
- .text
- .align 4
- .globl __kernel_getcpu
- .type __kernel_getcpu,@function
-__kernel_getcpu:
- CFI_STARTPROC
- sacf 256
- lm %r4,%r5,__VDSO_GETCPU_VAL(%r0)
- sacf 0
- ltgr %r2,%r2
- jz 2f
- st %r5,0(%r2)
-2: ltgr %r3,%r3
- jz 3f
- st %r4,0(%r3)
-3: lghi %r2,0
- br %r14
- CFI_ENDPROC
- .size __kernel_getcpu,.-__kernel_getcpu
diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c
new file mode 100644
index 000000000000..5c5d4a848b76
--- /dev/null
+++ b/arch/s390/kernel/vdso64/getcpu.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright IBM Corp. 2020 */
+
+#include <linux/compiler.h>
+#include <linux/getcpu.h>
+#include <asm/timex.h>
+#include "vdso.h"
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ union tod_clock clk;
+
+ /* CPU number is stored in the programmable field of the TOD clock */
+ store_tod_clock_ext(&clk);
+ if (cpu)
+ *cpu = clk.pf;
+ /* NUMA node is always zero */
+ if (node)
+ *node = 0;
+ return 0;
+}
diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S
deleted file mode 100644
index aebe10dc7c99..000000000000
--- a/arch/s390/kernel/vdso64/gettimeofday.S
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Userland implementation of gettimeofday() for 64 bits processes in a
- * s390 kernel for use in the vDSO
- *
- * Copyright IBM Corp. 2008
- * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
- */
-#include <asm/vdso.h>
-#include <asm/asm-offsets.h>
-#include <asm/unistd.h>
-#include <asm/dwarf.h>
-#include <asm/ptrace.h>
-
- .text
- .align 4
- .globl __kernel_gettimeofday
- .type __kernel_gettimeofday,@function
-__kernel_gettimeofday:
- CFI_STARTPROC
- aghi %r15,-16
- CFI_ADJUST_CFA_OFFSET 16
- CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
- larl %r5,_vdso_data
-0: ltgr %r3,%r3 /* check if tz is NULL */
- je 1f
- mvc 0(8,%r3),__VDSO_TIMEZONE(%r5)
-1: ltgr %r2,%r2 /* check if tv is NULL */
- je 4f
- lg %r4,__VDSO_UPD_COUNT(%r5) /* load update counter */
- tmll %r4,0x0001 /* pending update ? loop */
- jnz 0b
- stcke 0(%r15) /* Store TOD clock */
- lg %r1,1(%r15)
- lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */
- slgr %r0,%r1 /* now - ts_steering_end */
- ltgr %r0,%r0 /* past end of steering ? */
- jm 6f
- srlg %r0,%r0,15 /* 1 per 2^16 */
- tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */
- jz 7f
- lcgr %r0,%r0 /* negative TOD offset */
-7: algr %r1,%r0 /* add steering offset */
-6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */
- msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */
- alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */
- lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */
- clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
- jne 0b
- lgf %r5,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */
- srlg %r1,%r1,0(%r5) /* >> tk->shift */
- larl %r5,5f
-2: clg %r1,0(%r5)
- jl 3f
- slg %r1,0(%r5)
- aghi %r0,1
- j 2b
-3: stg %r0,0(%r2) /* store tv->tv_sec */
- slgr %r0,%r0 /* tv_nsec -> tv_usec */
- ml %r0,8(%r5)
- srlg %r0,%r0,6
- stg %r0,8(%r2) /* store tv->tv_usec */
-4: lghi %r2,0
- aghi %r15,16
- CFI_ADJUST_CFA_OFFSET -16
- CFI_RESTORE 15
- br %r14
- CFI_ENDPROC
-5: .quad 1000000000
- .long 274877907
- .size __kernel_gettimeofday,.-__kernel_gettimeofday
diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h
new file mode 100644
index 000000000000..34c7a2312f9d
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H
+#define __ARCH_S390_KERNEL_VDSO64_VDSO_H
+
+#include <vdso/datapage.h>
+
+struct getcpu_cache;
+
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts);
+
+#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 7ddb116b5e2e..4461ea151e49 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -13,7 +13,11 @@ ENTRY(_start)
SECTIONS
{
- . = VDSO64_LBASE + SIZEOF_HEADERS;
+ PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+ PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
+ . = VDSO_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
.gnu.hash : { *(.gnu.hash) }
@@ -47,6 +51,7 @@ SECTIONS
.rela.dyn ALIGN(8) : { *(.rela.dyn) }
.got ALIGN(8) : { *(.got .toc) }
+ .got.plt ALIGN(8) : { *(.got.plt) }
_end = .;
PROVIDE(end = .);
@@ -94,9 +99,6 @@ SECTIONS
.debug_ranges 0 : { *(.debug_ranges) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
- . = ALIGN(PAGE_SIZE);
- PROVIDE(_vdso_data = .);
-
/DISCARD/ : {
*(.note.GNU-stack)
*(.branch_lt)
@@ -136,7 +138,9 @@ VERSION
__kernel_clock_gettime;
__kernel_clock_getres;
__kernel_getcpu;
-
+ __kernel_restart_syscall;
+ __kernel_rt_sigreturn;
+ __kernel_sigreturn;
local: *;
};
}
diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso64/vdso64_generic.c
new file mode 100644
index 000000000000..a9aa75643c08
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso64_generic.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../../../lib/vdso/gettimeofday.c"
+#include "vdso.h"
+
+int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv,
+ struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_getres(clock, ts);
+}
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
new file mode 100644
index 000000000000..57f62596e53b
--- /dev/null
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/vdso.h>
+#include <asm/unistd.h>
+#include <asm/asm-offsets.h>
+#include <asm/dwarf.h>
+#include <asm/ptrace.h>
+
+#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8)
+
+/*
+ * Older glibc version called vdso without allocating a stackframe. This wrapper
+ * is just used to allocate a stackframe. See
+ * https://sourceware.org/git/?p=glibc.git;a=commit;h=478593e6374f3818da39332260dc453cb19cfa1e
+ * for details.
+ */
+.macro vdso_func func
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ __ALIGN
+__kernel_\func:
+ CFI_STARTPROC
+ aghi %r15,-WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE)
+ CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD
+ stg %r14,STACK_FRAME_OVERHEAD(%r15)
+ brasl %r14,__s390_vdso_\func
+ lg %r14,STACK_FRAME_OVERHEAD(%r15)
+ aghi %r15,WRAPPER_FRAME_SIZE
+ CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD
+ CFI_RESTORE 15
+ br %r14
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_func gettimeofday
+vdso_func clock_getres
+vdso_func clock_gettime
+vdso_func getcpu
+
+.macro vdso_syscall func,syscall
+ .globl __kernel_\func
+ .type __kernel_\func,@function
+ __ALIGN
+__kernel_\func:
+ CFI_STARTPROC
+ svc \syscall
+ /* Make sure we notice when a syscall returns, which shouldn't happen */
+ .word 0
+ CFI_ENDPROC
+ .size __kernel_\func,.-__kernel_\func
+.endm
+
+vdso_syscall restart_syscall,__NR_restart_syscall
+vdso_syscall sigreturn,__NR_sigreturn
+vdso_syscall rt_sigreturn,__NR_rt_sigreturn
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 37695499717d..e32ef446f451 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,16 +5,22 @@
#include <asm/thread_info.h>
#include <asm/page.h>
+#include <asm/ftrace.lds.h>
/*
* Put .bss..swapper_pg_dir as the first thing in .bss. This will
* make sure it has 16k alignment.
*/
-#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir)
+#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \
+ *(.bss..invalid_pg_dir)
+
+#define RO_EXCEPTION_TABLE_ALIGN 16
/* Handle ro_after_init data on our own. */
#define RO_AFTER_INIT_DATA
+#define RUNTIME_DISCARD_EXIT
+
#define EMITS_PT_NOTE
#include <asm-generic/vmlinux.lds.h>
@@ -40,11 +46,11 @@ SECTIONS
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
- CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
+ FTRACE_HOTPATCH_TRAMPOLINES_TEXT
*(.text.*_indirect_*)
*(.fixup)
*(.gnu.warning)
@@ -63,13 +69,20 @@ SECTIONS
*(.data..ro_after_init)
JUMP_TABLE_DATA
} :data
- EXCEPTION_TABLE(16)
. = ALIGN(PAGE_SIZE);
__end_ro_after_init = .;
RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE)
BOOT_DATA_PRESERVED
+ . = ALIGN(8);
+ .amode31.refs : {
+ _start_amode31_refs = .;
+ *(.amode31.refs)
+ _end_amode31_refs = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
_edata = .; /* End of data section */
/* will be freed after init */
@@ -122,6 +135,7 @@ SECTIONS
/*
* Table with the patch locations to undo expolines
*/
+ . = ALIGN(4);
.nospec_call_table : {
__nospec_call_start = . ;
*(.s390_indirect*)
@@ -135,6 +149,32 @@ SECTIONS
BOOT_DATA
+ /*
+ * .amode31 section for code, data, ex_table that need to stay
+ * below 2 GB, even when the kernel is relocated above 2 GB.
+ */
+ . = ALIGN(PAGE_SIZE);
+ _samode31 = .;
+ .amode31.text : {
+ _stext_amode31 = .;
+ *(.amode31.text)
+ *(.amode31.text.*_indirect_*)
+ . = ALIGN(PAGE_SIZE);
+ _etext_amode31 = .;
+ }
+ . = ALIGN(16);
+ .amode31.ex_table : {
+ _start_amode31_ex_table = .;
+ KEEP(*(.amode31.ex_table))
+ _stop_amode31_ex_table = .;
+ }
+ . = ALIGN(PAGE_SIZE);
+ .amode31.data : {
+ *(.amode31.data)
+ }
+ . = ALIGN(PAGE_SIZE);
+ _eamode31 = .;
+
/* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE);
INIT_DATA_SECTION(0x100)
@@ -157,6 +197,7 @@ SECTIONS
BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE)
+ . = ALIGN(PAGE_SIZE);
_end = . ;
/*
@@ -176,15 +217,28 @@ SECTIONS
QUAD(__dynsym_start) /* dynsym_start */
QUAD(__rela_dyn_start) /* rela_dyn_start */
QUAD(__rela_dyn_end) /* rela_dyn_end */
+ QUAD(_eamode31 - _samode31) /* amode31_size */
+ QUAD(init_mm)
+ QUAD(swapper_pg_dir)
+ QUAD(invalid_pg_dir)
+#ifdef CONFIG_KASAN
+ QUAD(kasan_early_shadow_page)
+ QUAD(kasan_early_shadow_pte)
+ QUAD(kasan_early_shadow_pmd)
+ QUAD(kasan_early_shadow_pud)
+ QUAD(kasan_early_shadow_p4d)
+#endif
} :NONE
/* Debugging sections. */
STABS_DEBUG
DWARF_DEBUG
+ ELF_DETAILS
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
*(.eh_frame)
+ *(.interp)
}
}
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 8df10d3c8f6c..e0a88dcaf5cb 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -7,13 +7,13 @@
*/
#include <linux/kernel_stat.h>
-#include <linux/sched/cputime.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/types.h>
#include <linux/time.h>
-
+#include <asm/alternative.h>
+#include <asm/cputime.h>
#include <asm/vtimer.h>
#include <asm/vtime.h>
#include <asm/cpu_mf.h>
@@ -130,13 +130,10 @@ static int do_account_vtime(struct task_struct *tsk)
clock = S390_lowcore.last_update_clock;
asm volatile(
" stpt %0\n" /* Store current cpu timer value */
-#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES
" stckf %1" /* Store current tod clock value */
-#else
- " stck %1" /* Store current tod clock value */
-#endif
: "=Q" (S390_lowcore.last_update_timer),
- "=Q" (S390_lowcore.last_update_clock));
+ "=Q" (S390_lowcore.last_update_clock)
+ : : "cc");
clock = S390_lowcore.last_update_clock - clock;
timer -= S390_lowcore.last_update_timer;
@@ -216,41 +213,56 @@ void vtime_flush(struct task_struct *tsk)
avg_steal = S390_lowcore.avg_steal_timer / 2;
if ((s64) steal > 0) {
S390_lowcore.steal_timer = 0;
- account_steal_time(steal);
+ account_steal_time(cputime_to_nsecs(steal));
avg_steal += steal;
}
S390_lowcore.avg_steal_timer = avg_steal;
}
+static u64 vtime_delta(void)
+{
+ u64 timer = S390_lowcore.last_update_timer;
+
+ S390_lowcore.last_update_timer = get_vtimer();
+
+ return timer - S390_lowcore.last_update_timer;
+}
+
/*
* Update process times based on virtual cpu times stored by entry.S
* to the lowcore fields user_timer, system_timer & steal_clock.
*/
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_account_kernel(struct task_struct *tsk)
{
- u64 timer;
+ u64 delta = vtime_delta();
- timer = S390_lowcore.last_update_timer;
- S390_lowcore.last_update_timer = get_vtimer();
- timer -= S390_lowcore.last_update_timer;
-
- if ((tsk->flags & PF_VCPU) && (irq_count() == 0))
- S390_lowcore.guest_timer += timer;
- else if (hardirq_count())
- S390_lowcore.hardirq_timer += timer;
- else if (in_serving_softirq())
- S390_lowcore.softirq_timer += timer;
+ if (tsk->flags & PF_VCPU)
+ S390_lowcore.guest_timer += delta;
else
- S390_lowcore.system_timer += timer;
+ S390_lowcore.system_timer += delta;
- virt_timer_forward(timer);
+ virt_timer_forward(delta);
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-
-void vtime_account_kernel(struct task_struct *tsk)
-__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_kernel);
+void vtime_account_softirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.softirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
+void vtime_account_hardirq(struct task_struct *tsk)
+{
+ u64 delta = vtime_delta();
+
+ S390_lowcore.hardirq_timer += delta;
+
+ virt_timer_forward(delta);
+}
+
/*
* Sorted add to a list. List is linear searched until first bigger
* element is found.