diff options
Diffstat (limited to 'arch/s390/kernel')
117 files changed, 11861 insertions, 8339 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 4a44ba5a2d73..42c83d60d6fa 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -10,6 +10,9 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) # Do not trace early setup code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE) endif @@ -33,48 +36,49 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o +obj-y := head.o traps.o time.o process.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o -obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o -obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o +obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o +obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o skey.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o +obj-y += diag/ -extra-y += head64.o vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) +obj-$(CONFIG_SYSFS) += cpacf.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o +obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o -compat-obj-$(CONFIG_AUDIT) += compat_audit.o -obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o -obj-$(CONFIG_COMPAT) += $(compat-obj-y) obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_KPROBES) += kprobes_insn_page.o -obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o +obj-$(CONFIG_KPROBES) += mcount.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o - +obj-$(CONFIG_STACKPROTECTOR) += stackprotector.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o - +obj-$(CONFIG_CERT_STORE) += cert_store.o obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o +obj-$(CONFIG_PERF_EVENTS) += perf_pai.o obj-$(CONFIG_TRACEPOINTS) += trace.o -obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso -obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-y += vdso/ diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c new file mode 100644 index 000000000000..6252b7d115dd --- /dev/null +++ b/arch/s390/kernel/abs_lowcore.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/pgtable.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> + +unsigned long __bootdata_preserved(__abs_lowcore); + +int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + unsigned long phys = __pa(lc); + int rc, i; + + for (i = 0; i < LC_PAGES; i++) { + rc = __vmem_map_4k_page(addr, phys, PAGE_KERNEL, alloc); + if (rc) { + /* + * Do not unmap allocated page tables in case the + * allocation was not requested. In such a case the + * request is expected coming from an atomic context, + * while the unmap attempt might sleep. + */ + if (alloc) { + for (--i; i >= 0; i--) { + addr -= PAGE_SIZE; + vmem_unmap_4k_page(addr); + } + } + return rc; + } + addr += PAGE_SIZE; + phys += PAGE_SIZE; + } + return 0; +} + +void abs_lowcore_unmap(int cpu) +{ + unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore)); + int i; + + for (i = 0; i < LC_PAGES; i++) { + vmem_unmap_4k_page(addr); + addr += PAGE_SIZE; + } +} diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index c22ea1c3ef84..90c0e6408992 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -1,112 +1,90 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <asm/alternative.h> -#include <asm/facility.h> -#include <asm/nospec-branch.h> - -#define MAX_PATCH_LEN (255 - 1) -static int __initdata_or_module alt_instr_disabled; +#ifndef pr_fmt +#define pr_fmt(fmt) "alt: " fmt +#endif -static int __init disable_alternative_instructions(char *str) -{ - alt_instr_disabled = 1; - return 0; -} +#include <linux/uaccess.h> +#include <linux/printk.h> +#include <asm/nospec-branch.h> +#include <asm/abs_lowcore.h> +#include <asm/alternative.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/machine.h> -early_param("noaltinstr", disable_alternative_instructions); +#ifndef a_debug +#define a_debug pr_debug +#endif -struct brcl_insn { - u16 opc; - s32 disp; -} __packed; +#ifndef __kernel_va +#define __kernel_va(x) (void *)(x) +#endif -static u16 __initdata_or_module nop16 = 0x0700; -static u32 __initdata_or_module nop32 = 0x47000000; -static struct brcl_insn __initdata_or_module nop48 = { - 0xc004, 0 -}; +unsigned long __bootdata_preserved(machine_features[1]); -static const void *nops[] __initdata_or_module = { - &nop16, - &nop32, - &nop48 +struct alt_debug { + unsigned long facilities[MAX_FACILITY_BIT / BITS_PER_LONG]; + unsigned long mfeatures[MAX_MFEATURE_BIT / BITS_PER_LONG]; + int spec; }; -static void __init_or_module add_jump_padding(void *insns, unsigned int len) -{ - struct brcl_insn brcl = { - 0xc0f4, - len / 2 - }; - - memcpy(insns, &brcl, sizeof(brcl)); - insns += sizeof(brcl); - len -= sizeof(brcl); - - while (len > 0) { - memcpy(insns, &nop16, 2); - insns += 2; - len -= 2; - } -} +static struct alt_debug __bootdata_preserved(alt_debug); -static void __init_or_module add_padding(void *insns, unsigned int len) +static void alternative_dump(u8 *old, u8 *new, unsigned int len, unsigned int type, unsigned int data) { - if (len > 6) - add_jump_padding(insns, len); - else if (len >= 2) - memcpy(insns, nops[len / 2 - 1], len); + char oinsn[33], ninsn[33]; + unsigned long kptr; + unsigned int pos; + + for (pos = 0; pos < len && 2 * pos < sizeof(oinsn) - 3; pos++) + hex_byte_pack(&oinsn[2 * pos], old[pos]); + oinsn[2 * pos] = 0; + for (pos = 0; pos < len && 2 * pos < sizeof(ninsn) - 3; pos++) + hex_byte_pack(&ninsn[2 * pos], new[pos]); + ninsn[2 * pos] = 0; + kptr = (unsigned long)__kernel_va(old); + a_debug("[%d/%3d] %016lx: %s -> %s\n", type, data, kptr, oinsn, ninsn); } -static void __init_or_module __apply_alternatives(struct alt_instr *start, - struct alt_instr *end) +void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx) { + struct alt_debug *d; struct alt_instr *a; - u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; + bool debug, replace; + u8 *old, *new; /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. */ + d = &alt_debug; for (a = start; a < end; a++) { - int insnbuf_sz = 0; - - instr = (u8 *)&a->instr_offset + a->instr_offset; - replacement = (u8 *)&a->repl_offset + a->repl_offset; - - if (!__test_facility(a->facility, alt_stfle_fac_list)) - continue; - - if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { - WARN_ONCE(1, "cpu alternatives instructions length is " - "odd, skipping patching\n"); + if (!(a->ctx & ctx)) continue; + switch (a->type) { + case ALT_TYPE_FACILITY: + replace = test_facility(a->data); + debug = __test_facility(a->data, d->facilities); + break; + case ALT_TYPE_FEATURE: + replace = test_machine_feature(a->data); + debug = __test_machine_feature(a->data, d->mfeatures); + break; + case ALT_TYPE_SPEC: + replace = nobp_enabled(); + debug = d->spec; + break; + default: + replace = false; + debug = false; } - - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; - - if (a->instrlen > a->replacementlen) { - add_padding(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; - } - - s390_kernel_write(instr, insnbuf, insnbuf_sz); + if (!replace) + continue; + old = (u8 *)&a->instr_offset + a->instr_offset; + new = (u8 *)&a->repl_offset + a->repl_offset; + if (debug) + alternative_dump(old, new, a->instrlen, a->type, a->data); + s390_kernel_write(old, new, a->instrlen); } } - -void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) -{ - if (!alt_instr_disabled) - __apply_alternatives(start, end); -} - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -void __init apply_alternative_instructions(void) -{ - apply_alternatives(__alt_instructions, __alt_instructions_end); -} diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 77ff2130cb04..e1a5b5b54e4f 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -4,17 +4,16 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ - -#define ASM_OFFSETS_C +#define COMPILE_OFFSETS #include <linux/kbuild.h> -#include <linux/kvm_host.h> #include <linux/sched.h> #include <linux/purgatory.h> #include <linux/pgtable.h> -#include <asm/idle.h> -#include <asm/gmap.h> +#include <linux/ftrace_regs.h> +#include <asm/kvm_host_types.h> #include <asm/stacktrace.h> +#include <asm/ptrace.h> int main(void) { @@ -22,43 +21,67 @@ int main(void) OFFSET(__TASK_stack, task_struct, stack); OFFSET(__TASK_thread, task_struct, thread); OFFSET(__TASK_pid, task_struct, pid); +#ifdef CONFIG_STACKPROTECTOR + OFFSET(__TASK_stack_canary, task_struct, stack_canary); +#endif BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); + OFFSET(__TI_sie, task_struct, thread_info.sie); BLANK(); /* pt_regs offsets */ OFFSET(__PT_PSW, pt_regs, psw); OFFSET(__PT_GPRS, pt_regs, gprs); + OFFSET(__PT_R0, pt_regs, gprs[0]); + OFFSET(__PT_R1, pt_regs, gprs[1]); + OFFSET(__PT_R2, pt_regs, gprs[2]); + OFFSET(__PT_R3, pt_regs, gprs[3]); + OFFSET(__PT_R4, pt_regs, gprs[4]); + OFFSET(__PT_R5, pt_regs, gprs[5]); + OFFSET(__PT_R6, pt_regs, gprs[6]); + OFFSET(__PT_R7, pt_regs, gprs[7]); + OFFSET(__PT_R8, pt_regs, gprs[8]); + OFFSET(__PT_R9, pt_regs, gprs[9]); + OFFSET(__PT_R10, pt_regs, gprs[10]); + OFFSET(__PT_R11, pt_regs, gprs[11]); + OFFSET(__PT_R12, pt_regs, gprs[12]); + OFFSET(__PT_R13, pt_regs, gprs[13]); + OFFSET(__PT_R14, pt_regs, gprs[14]); + OFFSET(__PT_R15, pt_regs, gprs[15]); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); + OFFSET(__PT_INT_CODE, pt_regs, int_code); OFFSET(__PT_FLAGS, pt_regs, flags); - OFFSET(__PT_CR1, pt_regs, cr1); + OFFSET(__PT_LAST_BREAK, pt_regs, last_break); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); BLANK(); /* stack_frame offsets */ OFFSET(__SF_BACKCHAIN, stack_frame, back_chain); OFFSET(__SF_GPRS, stack_frame, gprs); - OFFSET(__SF_EMPTY, stack_frame, empty1[0]); - OFFSET(__SF_SIE_CONTROL, stack_frame, empty1[1]); - OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[2]); - OFFSET(__SF_SIE_REASON, stack_frame, empty1[3]); - OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[4]); + OFFSET(__SF_EMPTY, stack_frame, empty[0]); + OFFSET(__SF_SIE_CONTROL, stack_frame, sie_control_block); + OFFSET(__SF_SIE_SAVEAREA, stack_frame, sie_savearea); + OFFSET(__SF_SIE_REASON, stack_frame, sie_reason); + OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags); + OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys); + OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce); + OFFSET(__SF_SIE_IRQ, stack_frame, sie_irq); + DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame)); BLANK(); - /* idle data offsets */ - OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); - OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); - OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); - OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); - OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); + OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain); + DEFINE(STACK_FRAME_USER_OVERHEAD, sizeof(struct stack_frame_user)); + OFFSET(__SFVDSO_RETURN_ADDRESS, stack_frame_vdso_wrapper, return_address); + DEFINE(STACK_FRAME_VDSO_OVERHEAD, sizeof(struct stack_frame_vdso_wrapper)); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); OFFSET(__LC_EXT_CPU_ADDR, lowcore, ext_cpu_addr); OFFSET(__LC_EXT_INT_CODE, lowcore, ext_int_code); OFFSET(__LC_PGM_ILC, lowcore, pgm_ilc); - OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_CODE, lowcore, pgm_code); + OFFSET(__LC_PGM_INT_CODE, lowcore, pgm_int_code); OFFSET(__LC_DATA_EXC_CODE, lowcore, data_exc_code); OFFSET(__LC_MON_CLASS_NR, lowcore, mon_class_num); OFFSET(__LC_PER_CODE, lowcore, per_code); @@ -77,7 +100,7 @@ int main(void) OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code); OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code); OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address); - OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr); + OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break); OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe); OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe); OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw); @@ -93,10 +116,9 @@ int main(void) OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw); OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw); /* software defined lowcore locations 0x200 - 0xdff*/ - OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); - OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); + OFFSET(__LC_SAVE_AREA, lowcore, save_area); OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); - OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); + OFFSET(__LC_PCPU, lowcore, pcpu); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); @@ -105,8 +127,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); - OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); @@ -116,17 +136,17 @@ int main(void) OFFSET(__LC_RESTART_FN, lowcore, restart_fn); OFFSET(__LC_RESTART_DATA, lowcore, restart_data); OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); + OFFSET(__LC_RESTART_FLAGS, lowcore, restart_flags); OFFSET(__LC_KERNEL_ASCE, lowcore, kernel_asce); OFFSET(__LC_USER_ASCE, lowcore, user_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); - OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); - OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); - OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); - OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); + OFFSET(__LC_LAST_BREAK, lowcore, last_break); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ + OFFSET(__LC_STACK_CANARY, lowcore, stack_canary); OFFSET(__LC_DUMP_REIPL, lowcore, ipib); + OFFSET(__LC_VMCORE_INFO, lowcore, vmcore_info); + OFFSET(__LC_OS_INFO, lowcore, os_info); /* hardware defined lowcore locations 0x1000 - 0x18ff */ OFFSET(__LC_MCESAD, lowcore, mcesad); OFFSET(__LC_EXT_PARAMS2, lowcore, ext_params2); @@ -138,12 +158,12 @@ int main(void) OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area); OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area); OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area); + OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area); OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area); OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); /* gmap/sie offsets */ - OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20); /* kexec_sha_region */ @@ -152,5 +172,17 @@ int main(void) DEFINE(__KEXEC_SHA_REGION_SIZE, sizeof(struct kexec_sha_region)); /* sizeof kernel parameter area */ DEFINE(__PARMAREA_SIZE, sizeof(struct parmarea)); + /* kernel parameter area offsets */ + DEFINE(IPL_DEVICE, PARMAREA + offsetof(struct parmarea, ipl_device)); + DEFINE(INITRD_START, PARMAREA + offsetof(struct parmarea, initrd_start)); + DEFINE(INITRD_SIZE, PARMAREA + offsetof(struct parmarea, initrd_size)); + DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base)); + DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); + DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); + DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); + OFFSET(__FTRACE_REGS_PT_REGS, __arch_ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs)); + + OFFSET(__PCPU_FLAGS, pcpu, flags); return 0; } diff --git a/arch/s390/kernel/audit.c b/arch/s390/kernel/audit.c index d395c6c9944c..7897d9411e13 100644 --- a/arch/s390/kernel/audit.c +++ b/arch/s390/kernel/audit.c @@ -3,7 +3,6 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> -#include "audit.h" static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -32,42 +31,29 @@ static unsigned signal_class[] = { int audit_classify_arch(int arch) { -#ifdef CONFIG_COMPAT - if (arch == AUDIT_ARCH_S390) - return 1; -#endif return 0; } int audit_classify_syscall(int abi, unsigned syscall) { -#ifdef CONFIG_COMPAT - if (abi == AUDIT_ARCH_S390) - return s390_classify_syscall(syscall); -#endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } static int __init audit_classes_init(void) { -#ifdef CONFIG_COMPAT - audit_register_class(AUDIT_CLASS_WRITE_32, s390_write_class); - audit_register_class(AUDIT_CLASS_READ_32, s390_read_class); - audit_register_class(AUDIT_CLASS_DIR_WRITE_32, s390_dir_class); - audit_register_class(AUDIT_CLASS_CHATTR_32, s390_chattr_class); - audit_register_class(AUDIT_CLASS_SIGNAL_32, s390_signal_class); -#endif audit_register_class(AUDIT_CLASS_WRITE, write_class); audit_register_class(AUDIT_CLASS_READ, read_class); audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); diff --git a/arch/s390/kernel/audit.h b/arch/s390/kernel/audit.h deleted file mode 100644 index 4d4b596412ec..000000000000 --- a/arch/s390/kernel/audit.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_AUDIT_H -#define __ARCH_S390_KERNEL_AUDIT_H - -#include <linux/types.h> - -#ifdef CONFIG_COMPAT -extern int s390_classify_syscall(unsigned); -extern __u32 s390_dir_class[]; -extern __u32 s390_write_class[]; -extern __u32 s390_read_class[]; -extern __u32 s390_chattr_class[]; -extern __u32 s390_signal_class[]; -#endif /* CONFIG_COMPAT */ - -#endif /* __ARCH_S390_KERNEL_AUDIT_H */ diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S deleted file mode 100644 index d255c69c1779..000000000000 --- a/arch/s390/kernel/base.S +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * arch/s390/kernel/base.S - * - * Copyright IBM Corp. 2006, 2007 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> - * Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/nospec-insn.h> -#include <asm/ptrace.h> - - GEN_BR_THUNK %r9 - GEN_BR_THUNK %r14 - -ENTRY(s390_base_pgm_handler) - stmg %r0,%r15,__LC_SAVE_AREA_SYNC - basr %r13,0 -0: aghi %r15,-STACK_FRAME_OVERHEAD - larl %r1,s390_base_pgm_handler_fn - lg %r9,0(%r1) - ltgr %r9,%r9 - jz 1f - BASR_EX %r14,%r9 - lmg %r0,%r15,__LC_SAVE_AREA_SYNC - lpswe __LC_PGM_OLD_PSW -1: lpswe disabled_wait_psw-0b(%r13) -ENDPROC(s390_base_pgm_handler) - - .align 8 -disabled_wait_psw: - .quad 0x0002000180000000,0x0000000000000000 + s390_base_pgm_handler - - .section .bss - .align 8 - .globl s390_base_pgm_handler_fn -s390_base_pgm_handler_fn: - .quad 0 - .previous diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c index d66825e53fce..4f2669030220 100644 --- a/arch/s390/kernel/cache.c +++ b/arch/s390/kernel/cache.c @@ -3,7 +3,6 @@ * Extract CPU cache information and expose them via sysfs. * * Copyright IBM Corp. 2012 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/seq_file.h> @@ -47,7 +46,7 @@ struct cache_info { #define CACHE_MAX_LEVEL 8 union cache_topology { struct cache_info ci[CACHE_MAX_LEVEL]; - unsigned long long raw; + unsigned long raw; }; static const char * const cache_type_string[] = { @@ -71,8 +70,6 @@ void show_cacheinfo(struct seq_file *m) struct cacheinfo *cache; int idx; - if (!test_facility(34)) - return; this_cpu_ci = get_cpu_cacheinfo(cpumask_any(cpu_online_mask)); for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { cache = this_cpu_ci->info_list + idx; @@ -132,8 +129,6 @@ int init_cache_level(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; if (!this_cpu_ci) return -EINVAL; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); @@ -157,8 +152,6 @@ int populate_cache_leaves(unsigned int cpu) union cache_topology ct; enum cache_type ctype; - if (!test_facility(34)) - return -EOPNOTSUPP; ct.raw = ecag(EXTRACT_TOPOLOGY, 0, 0); for (idx = 0, level = 0; level < this_cpu_ci->num_levels && idx < this_cpu_ci->num_leaves; idx++, level++) { @@ -173,5 +166,6 @@ int populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, pvt, ctype, level, cpu); } } + this_cpu_ci->cpu_map_populated = true; return 0; } diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 000000000000..c217a5e64094 --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 2023 + * Author(s): Anastasia Eskova <anastasia.eskova@ibm.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/key-type.h> +#include <linux/key.h> +#include <linux/keyctl.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#include <crypto/sha2.h> +#include <keys/user-type.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. */ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. */ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. */ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. */ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. */ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vce_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm_inline volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in + * VCE. Return -EINVAL if hashes don't match. + */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. */ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). + */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create key description in format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to key description or NULL if memory + * allocation failed. Memory should be freed by caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%05u:%010u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 5 + 1 + 10 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + snprintf(desc + name_len, len - name_len, ":%05u:%010u", + vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from VCE for key + * payload and key description. Link the key to "cert_store" keyring. + */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode2. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR. */ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. */ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for single-entry VCB, get VCB via DIAG320 subcode 2 call, + * extract VCE and create a key from its' certificate. + */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entries indices start with 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow to link more keys to certificate store keyring after all + * the VCEs were processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed. + * Return -ENOENT if subcodes 1 or 2 are not available. + */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + rc = -ENOMEM; + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. */ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/compat_audit.c b/arch/s390/kernel/compat_audit.c deleted file mode 100644 index 444fb1f66944..000000000000 --- a/arch/s390/kernel/compat_audit.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#undef __s390x__ -#include <asm/unistd.h> -#include "audit.h" - -unsigned s390_dir_class[] = { -#include <asm-generic/audit_dir_write.h> -~0U -}; - -unsigned s390_chattr_class[] = { -#include <asm-generic/audit_change_attr.h> -~0U -}; - -unsigned s390_write_class[] = { -#include <asm-generic/audit_write.h> -~0U -}; - -unsigned s390_read_class[] = { -#include <asm-generic/audit_read.h> -~0U -}; - -unsigned s390_signal_class[] = { -#include <asm-generic/audit_signal.h> -~0U -}; - -int s390_classify_syscall(unsigned syscall) -{ - switch(syscall) { - case __NR_open: - return 2; - case __NR_openat: - return 3; - case __NR_socketcall: - return 4; - case __NR_execve: - return 5; - default: - return 1; - } -} diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c deleted file mode 100644 index f9d418d1b619..000000000000 --- a/arch/s390/kernel/compat_linux.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 version - * Copyright IBM Corp. 2000 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Gerhard Tonn (ton@de.ibm.com) - * Thomas Spatzier (tspat@de.ibm.com) - * - * Conversion between 31bit and 64bit native syscalls. - * - * Heavily inspired by the 32-bit Sparc compat code which is - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) - * - */ - - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/file.h> -#include <linux/signal.h> -#include <linux/resource.h> -#include <linux/times.h> -#include <linux/smp.h> -#include <linux/sem.h> -#include <linux/msg.h> -#include <linux/shm.h> -#include <linux/uio.h> -#include <linux/quota.h> -#include <linux/poll.h> -#include <linux/personality.h> -#include <linux/stat.h> -#include <linux/filter.h> -#include <linux/highmem.h> -#include <linux/mman.h> -#include <linux/ipv6.h> -#include <linux/in.h> -#include <linux/icmpv6.h> -#include <linux/syscalls.h> -#include <linux/sysctl.h> -#include <linux/binfmts.h> -#include <linux/capability.h> -#include <linux/compat.h> -#include <linux/vfs.h> -#include <linux/ptrace.h> -#include <linux/fadvise.h> -#include <linux/ipc.h> -#include <linux/slab.h> - -#include <asm/types.h> -#include <linux/uaccess.h> - -#include <net/scm.h> -#include <net/sock.h> - -#include "compat_linux.h" - -#ifdef CONFIG_SYSVIPC -COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, compat_ulong_t, second, - compat_ulong_t, third, compat_uptr_t, ptr) -{ - if (call >> 16) /* hack for backward compatibility */ - return -EINVAL; - return compat_ksys_ipc(call, first, second, third, ptr, third); -} -#endif - -COMPAT_SYSCALL_DEFINE3(s390_truncate64, const char __user *, path, u32, high, u32, low) -{ - return ksys_truncate(path, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE3(s390_ftruncate64, unsigned int, fd, u32, high, u32, low) -{ - return ksys_ftruncate(fd, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pread64, unsigned int, fd, char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pread64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE5(s390_pwrite64, unsigned int, fd, const char __user *, ubuf, - compat_size_t, count, u32, high, u32, low) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - return ksys_pwrite64(fd, ubuf, count, (unsigned long)high << 32 | low); -} - -COMPAT_SYSCALL_DEFINE4(s390_readahead, int, fd, u32, high, u32, low, s32, count) -{ - return ksys_readahead(fd, (unsigned long)high << 32 | low, count); -} - -struct stat64_emu31 { - unsigned long long st_dev; - unsigned int __pad1; -#define STAT64_HAS_BROKEN_ST_INO 1 - u32 __st_ino; - unsigned int st_mode; - unsigned int st_nlink; - u32 st_uid; - u32 st_gid; - unsigned long long st_rdev; - unsigned int __pad3; - long st_size; - u32 st_blksize; - unsigned char __pad4[4]; - u32 __pad5; /* future possible st_blocks high bits */ - u32 st_blocks; /* Number 512-byte blocks allocated. */ - u32 st_atime; - u32 __pad6; - u32 st_mtime; - u32 __pad7; - u32 st_ctime; - u32 __pad8; /* will be high 32 bits of ctime someday */ - unsigned long st_ino; -}; - -static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) -{ - struct stat64_emu31 tmp; - - memset(&tmp, 0, sizeof(tmp)); - - tmp.st_dev = huge_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - tmp.__st_ino = (u32)stat->ino; - tmp.st_mode = stat->mode; - tmp.st_nlink = (unsigned int)stat->nlink; - tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); - tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); - tmp.st_rdev = huge_encode_dev(stat->rdev); - tmp.st_size = stat->size; - tmp.st_blksize = (u32)stat->blksize; - tmp.st_blocks = (u32)stat->blocks; - tmp.st_atime = (u32)stat->atime.tv_sec; - tmp.st_mtime = (u32)stat->mtime.tv_sec; - tmp.st_ctime = (u32)stat->ctime.tv_sec; - - return copy_to_user(ubuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(s390_stat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_stat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_lstat64, const char __user *, filename, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_lstat(filename, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE2(s390_fstat64, unsigned int, fd, struct stat64_emu31 __user *, statbuf) -{ - struct kstat stat; - int ret = vfs_fstat(fd, &stat); - if (!ret) - ret = cp_stat64(statbuf, &stat); - return ret; -} - -COMPAT_SYSCALL_DEFINE4(s390_fstatat64, unsigned int, dfd, const char __user *, filename, - struct stat64_emu31 __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_stat64(statbuf, &stat); -} - -/* - * Linux/i386 didn't use to be able to handle more than - * 4 system call parameters, so these system calls used a memory - * block for parameter passing.. - */ - -struct mmap_arg_struct_emu31 { - compat_ulong_t addr; - compat_ulong_t len; - compat_ulong_t prot; - compat_ulong_t flags; - compat_ulong_t fd; - compat_ulong_t offset; -}; - -COMPAT_SYSCALL_DEFINE1(s390_old_mmap, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - if (a.offset & ~PAGE_MASK) - return -EINVAL; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, - a.offset >> PAGE_SHIFT); -} - -COMPAT_SYSCALL_DEFINE1(s390_mmap2, struct mmap_arg_struct_emu31 __user *, arg) -{ - struct mmap_arg_struct_emu31 a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -} - -COMPAT_SYSCALL_DEFINE3(s390_read, unsigned int, fd, char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_read(fd, buf, count); -} - -COMPAT_SYSCALL_DEFINE3(s390_write, unsigned int, fd, const char __user *, buf, compat_size_t, count) -{ - if ((compat_ssize_t) count < 0) - return -EINVAL; - - return ksys_write(fd, buf, count); -} - -/* - * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64. - * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE} - * because the 31 bit values differ from the 64 bit values. - */ - -COMPAT_SYSCALL_DEFINE5(s390_fadvise64, int, fd, u32, high, u32, low, compat_size_t, len, int, advise) -{ - if (advise == 4) - advise = POSIX_FADV_DONTNEED; - else if (advise == 5) - advise = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(fd, (unsigned long)high << 32 | low, len, - advise); -} - -struct fadvise64_64_args { - int fd; - long long offset; - long long len; - int advice; -}; - -COMPAT_SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) -{ - struct fadvise64_64_args a; - - if ( copy_from_user(&a, args, sizeof(a)) ) - return -EFAULT; - if (a.advice == 4) - a.advice = POSIX_FADV_DONTNEED; - else if (a.advice == 5) - a.advice = POSIX_FADV_NOREUSE; - return ksys_fadvise64_64(a.fd, a.offset, a.len, a.advice); -} - -COMPAT_SYSCALL_DEFINE6(s390_sync_file_range, int, fd, u32, offhigh, u32, offlow, - u32, nhigh, u32, nlow, unsigned int, flags) -{ - return ksys_sync_file_range(fd, ((loff_t)offhigh << 32) + offlow, - ((u64)nhigh << 32) + nlow, flags); -} - -COMPAT_SYSCALL_DEFINE6(s390_fallocate, int, fd, int, mode, u32, offhigh, u32, offlow, - u32, lenhigh, u32, lenlow) -{ - return ksys_fallocate(fd, mode, ((loff_t)offhigh << 32) + offlow, - ((u64)lenhigh << 32) + lenlow); -} diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h deleted file mode 100644 index 64509e7dbd3b..000000000000 --- a/arch/s390/kernel/compat_linux.h +++ /dev/null @@ -1,130 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_S390X_S390_H -#define _ASM_S390X_S390_H - -#include <linux/compat.h> -#include <linux/socket.h> -#include <linux/syscalls.h> - -/* Macro that masks the high order bit of an 32 bit pointer and converts it*/ -/* to a 64 bit pointer */ -#define A(__x) ((unsigned long)((__x) & 0x7FFFFFFFUL)) -#define AA(__x) \ - ((unsigned long)(__x)) - -/* Now 32bit compatibility types */ -struct ipc_kludge_32 { - __u32 msgp; /* pointer */ - __s32 msgtyp; -}; - -/* asm/sigcontext.h */ -typedef union -{ - __u64 d; - __u32 f; -} freg_t32; - -typedef struct -{ - unsigned int fpc; - unsigned int pad; - freg_t32 fprs[__NUM_FPRS]; -} _s390_fp_regs32; - -typedef struct -{ - __u32 mask; - __u32 addr; -} _psw_t32 __attribute__ ((aligned(8))); - -typedef struct -{ - _psw_t32 psw; - __u32 gprs[__NUM_GPRS]; - __u32 acrs[__NUM_ACRS]; -} _s390_regs_common32; - -typedef struct -{ - _s390_regs_common32 regs; - _s390_fp_regs32 fpregs; -} _sigregs32; - -typedef struct -{ - __u32 gprs_high[__NUM_GPRS]; - __u64 vxrs_low[__NUM_VXRS_LOW]; - __vector128 vxrs_high[__NUM_VXRS_HIGH]; - __u8 __reserved[128]; -} _sigregs_ext32; - -#define _SIGCONTEXT_NSIG32 64 -#define _SIGCONTEXT_NSIG_BPW32 32 -#define __SIGNAL_FRAMESIZE32 96 -#define _SIGMASK_COPY_SIZE32 (sizeof(u32)*2) - -struct sigcontext32 -{ - __u32 oldmask[_COMPAT_NSIG_WORDS]; - __u32 sregs; /* pointer */ -}; - -/* asm/signal.h */ - -/* asm/ucontext.h */ -struct ucontext32 { - __u32 uc_flags; - __u32 uc_link; /* pointer */ - compat_stack_t uc_stack; - _sigregs32 uc_mcontext; - compat_sigset_t uc_sigmask; - /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ - unsigned char __unused[128 - sizeof(compat_sigset_t)]; - _sigregs_ext32 uc_mcontext_ext; -}; - -struct stat64_emu31; -struct mmap_arg_struct_emu31; -struct fadvise64_64_args; - -long compat_sys_s390_chown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_lchown16(const char __user *filename, u16 user, u16 group); -long compat_sys_s390_fchown16(unsigned int fd, u16 user, u16 group); -long compat_sys_s390_setregid16(u16 rgid, u16 egid); -long compat_sys_s390_setgid16(u16 gid); -long compat_sys_s390_setreuid16(u16 ruid, u16 euid); -long compat_sys_s390_setuid16(u16 uid); -long compat_sys_s390_setresuid16(u16 ruid, u16 euid, u16 suid); -long compat_sys_s390_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user *suid); -long compat_sys_s390_setresgid16(u16 rgid, u16 egid, u16 sgid); -long compat_sys_s390_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user *sgid); -long compat_sys_s390_setfsuid16(u16 uid); -long compat_sys_s390_setfsgid16(u16 gid); -long compat_sys_s390_getgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_setgroups16(int gidsetsize, u16 __user *grouplist); -long compat_sys_s390_getuid16(void); -long compat_sys_s390_geteuid16(void); -long compat_sys_s390_getgid16(void); -long compat_sys_s390_getegid16(void); -long compat_sys_s390_truncate64(const char __user *path, u32 high, u32 low); -long compat_sys_s390_ftruncate64(unsigned int fd, u32 high, u32 low); -long compat_sys_s390_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, u32 high, u32 low); -long compat_sys_s390_readahead(int fd, u32 high, u32 low, s32 count); -long compat_sys_s390_stat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_lstat64(const char __user *filename, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstat64(unsigned int fd, struct stat64_emu31 __user *statbuf); -long compat_sys_s390_fstatat64(unsigned int dfd, const char __user *filename, struct stat64_emu31 __user *statbuf, int flag); -long compat_sys_s390_old_mmap(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_mmap2(struct mmap_arg_struct_emu31 __user *arg); -long compat_sys_s390_read(unsigned int fd, char __user * buf, compat_size_t count); -long compat_sys_s390_write(unsigned int fd, const char __user * buf, compat_size_t count); -long compat_sys_s390_fadvise64(int fd, u32 high, u32 low, compat_size_t len, int advise); -long compat_sys_s390_fadvise64_64(struct fadvise64_64_args __user *args); -long compat_sys_s390_sync_file_range(int fd, u32 offhigh, u32 offlow, u32 nhigh, u32 nlow, unsigned int flags); -long compat_sys_s390_fallocate(int fd, int mode, u32 offhigh, u32 offlow, u32 lenhigh, u32 lenlow); -long compat_sys_sigreturn(void); -long compat_sys_rt_sigreturn(void); - -#endif /* _ASM_S390X_S390_H */ diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h deleted file mode 100644 index 3c400fc7e987..000000000000 --- a/arch/s390/kernel/compat_ptrace.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PTRACE32_H -#define _PTRACE32_H - -#include <asm/ptrace.h> /* needed for NUM_CR_WORDS */ -#include "compat_linux.h" /* needed for psw_compat_t */ - -struct compat_per_struct_kernel { - __u32 cr9; /* PER control bits */ - __u32 cr10; /* PER starting address */ - __u32 cr11; /* PER ending address */ - __u32 bits; /* Obsolete software bits */ - __u32 starting_addr; /* User specified start address */ - __u32 ending_addr; /* User specified end address */ - __u16 perc_atmid; /* PER trap ATMID */ - __u32 address; /* PER trap instruction address */ - __u8 access_id; /* PER trap access identification */ -}; - -struct compat_user_regs_struct -{ - psw_compat_t psw; - u32 gprs[NUM_GPRS]; - u32 acrs[NUM_ACRS]; - u32 orig_gpr2; - /* nb: there's a 4-byte hole here */ - s390_fp_regs fp_regs; - /* - * These per registers are in here so that gdb can modify them - * itself as there is no "official" ptrace interface for hardware - * watchpoints. This is the way intel does it. - */ - struct compat_per_struct_kernel per_info; - u32 ieee_instruction_pointer; /* obsolete, always 0 */ -}; - -struct compat_user { - /* We start with the registers, to mimic the way that "memory" - is returned from the ptrace(3,...) function. */ - struct compat_user_regs_struct regs; - /* The rest of this junk is to help gdb figure out what goes where */ - u32 u_tsize; /* Text segment size (pages). */ - u32 u_dsize; /* Data segment size (pages). */ - u32 u_ssize; /* Stack segment size (pages). */ - u32 start_code; /* Starting virtual address of text. */ - u32 start_stack; /* Starting virtual address of stack area. - This is actually the bottom of the stack, - the top of the stack is always found in the - esp register. */ - s32 signal; /* Signal that caused the core dump. */ - u32 u_ar0; /* Used by gdb to help find the values for */ - /* the registers. */ - u32 magic; /* To uniquely identify a core file */ - char u_comm[32]; /* User command that was responsible */ -}; - -typedef struct -{ - __u32 len; - __u32 kernel_addr; - __u32 process_addr; -} compat_ptrace_area; - -#endif /* _PTRACE32_H */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c deleted file mode 100644 index cca142fbb516..000000000000 --- a/arch/s390/kernel/compat_signal.c +++ /dev/null @@ -1,422 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corp. 2000, 2006 - * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com) - * Gerhard Tonn (ton@de.ibm.com) - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - */ - -#include <linux/compat.h> -#include <linux/sched.h> -#include <linux/sched/task_stack.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/tty.h> -#include <linux/personality.h> -#include <linux/binfmts.h> -#include <asm/ucontext.h> -#include <linux/uaccess.h> -#include <asm/lowcore.h> -#include <asm/switch_to.h> -#include <asm/vdso.h> -#include "compat_linux.h" -#include "compat_ptrace.h" -#include "entry.h" - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - struct sigcontext32 sc; - _sigregs32 sregs; - int signo; - _sigregs_ext32 sregs_ext; - __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ -} sigframe32; - -typedef struct -{ - __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - __u16 svc_insn; - compat_siginfo_t info; - struct ucontext32 uc; -} rt_sigframe32; - -/* Store registers needed to create the signal frame */ -static void store_sigregs(void) -{ - save_access_regs(current->thread.acrs); - save_fpu_regs(); -} - -/* Load registers after signal return */ -static void load_sigregs(void) -{ - restore_access_regs(current->thread.acrs); -} - -static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - user_sregs.regs.psw.mask = (__u32)(regs->psw.mask >> 32); - user_sregs.regs.psw.mask &= PSW32_MASK_USER | PSW32_MASK_RI; - user_sregs.regs.psw.mask |= PSW32_USER_BITS; - user_sregs.regs.psw.addr = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - for (i = 0; i < NUM_GPRS; i++) - user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; - memcpy(&user_sregs.regs.acrs, current->thread.acrs, - sizeof(user_sregs.regs.acrs)); - fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); - if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) - return -EFAULT; - return 0; -} - -static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) -{ - _sigregs32 user_sregs; - int i; - - /* Alwys make any pending restarted system call return -EINTR */ - current->restart_block.fn = do_no_restart_syscall; - - if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) - return -EFAULT; - - if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) - return -EINVAL; - - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ - regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_USER) << 32 | - (__u64)(user_sregs.regs.psw.mask & PSW32_MASK_RI) << 32 | - (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_AMODE); - /* Check for invalid user address space control. */ - if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_HOME) - regs->psw.mask = PSW_ASC_PRIMARY | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); - for (i = 0; i < NUM_GPRS; i++) - regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; - memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, - sizeof(current->thread.acrs)); - fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); - - clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - return 0; -} - -static int save_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Save high gprs to signal stack */ - for (i = 0; i < NUM_GPRS; i++) - gprs_high[i] = regs->gprs[i] >> 32; - if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - - /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { - for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); - if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, - sizeof(sregs_ext->vxrs_low)) || - __copy_to_user(&sregs_ext->vxrs_high, - current->thread.fpu.vxrs + __NUM_VXRS_LOW, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - } - return 0; -} - -static int restore_sigregs_ext32(struct pt_regs *regs, - _sigregs_ext32 __user *sregs_ext) -{ - __u32 gprs_high[NUM_GPRS]; - __u64 vxrs[__NUM_VXRS_LOW]; - int i; - - /* Restore high gprs from signal stack */ - if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, - sizeof(sregs_ext->gprs_high))) - return -EFAULT; - for (i = 0; i < NUM_GPRS; i++) - *(__u32 *)®s->gprs[i] = gprs_high[i]; - - /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { - if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, - sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, - &sregs_ext->vxrs_high, - sizeof(sregs_ext->vxrs_high))) - return -EFAULT; - for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; - } - return 0; -} - -COMPAT_SYSCALL_DEFINE0(sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask)) - goto badframe; - set_current_blocked(&set); - save_fpu_regs(); - if (restore_sigregs32(regs, &frame->sregs)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->sregs_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -COMPAT_SYSCALL_DEFINE0(rt_sigreturn) -{ - struct pt_regs *regs = task_pt_regs(current); - rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; - sigset_t set; - - if (get_compat_sigset(&set, &frame->uc.uc_sigmask)) - goto badframe; - set_current_blocked(&set); - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; - save_fpu_regs(); - if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) - goto badframe; - if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - goto badframe; - load_sigregs(); - return regs->gprs[2]; -badframe: - force_sig(SIGSEGV); - return 0; -} - -/* - * Set up a signal frame. - */ - - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long sp; - - /* Default to using normal stack */ - sp = (unsigned long) A(regs->gprs[15]); - - /* Overflow on alternate signal stack gives SIGSEGV. */ - if (on_sig_stack(sp) && !on_sig_stack((sp - frame_size) & -8UL)) - return (void __user *) -1UL; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (! sas_ss_flags(sp)) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)((sp - frame_size) & -8ul); -} - -static int setup_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - int sig = ksig->sig; - sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - - /* - * gprs_high are always present for 31-bit compat tasks. - * The space for vector registers is only allocated if - * the machine supports it - */ - frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); - if (!MACHINE_HAS_VX) - frame_size -= sizeof(frame->sregs_ext.vxrs_low) + - sizeof(frame->sregs_ext.vxrs_high); - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) - return -EFAULT; - - /* Create struct sigcontext32 on the signal stack */ - if (put_compat_sigset((compat_sigset_t __user *)frame->sc.oldmask, - set, sizeof(compat_sigset_t))) - return -EFAULT; - if (__put_user(ptr_to_compat(&frame->sregs), &frame->sc.sregs)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create _sigregs32 on the signal stack */ - if (save_sigregs32(regs, &frame->sregs)) - return -EFAULT; - - /* Place signal number on stack to allow backtrace from handler. */ - if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) - return -EFAULT; - - /* Create _sigregs_ext32 on the signal stack */ - if (save_sigregs_ext32(regs, &frame->sregs_ext)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, sigreturn); - } - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__force __u64) ksig->ka.sa.sa_handler; - - regs->gprs[2] = sig; - regs->gprs[3] = (__force __u64) &frame->sc; - - /* We forgot to include these in the sigcontext. - To avoid breaking binary compatibility, they are passed as args. */ - if (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL || - sig == SIGTRAP || sig == SIGFPE) { - /* set extra registers only for synchronous signals */ - regs->gprs[4] = regs->int_code & 127; - regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = current->thread.last_break; - } - - return 0; -} - -static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, - struct pt_regs *regs) -{ - rt_sigframe32 __user *frame; - unsigned long restorer; - size_t frame_size; - u32 uc_flags; - - frame_size = sizeof(*frame) - - sizeof(frame->uc.uc_mcontext_ext.__reserved); - /* - * gprs_high are always present for 31-bit compat tasks. - * The space for vector registers is only allocated if - * the machine supports it - */ - uc_flags = UC_GPRS_HIGH; - if (MACHINE_HAS_VX) { - uc_flags |= UC_VXRS; - } else - frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + - sizeof(frame->uc.uc_mcontext_ext.vxrs_high); - frame = get_sigframe(&ksig->ka, regs, frame_size); - if (frame == (void __user *) -1UL) - return -EFAULT; - - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) - return -EFAULT; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = (unsigned long __force) - ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; - } else { - restorer = VDSO32_SYMBOL(current, rt_sigreturn); - } - - /* Create siginfo on the signal stack */ - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) - return -EFAULT; - - /* Store registers needed to create the signal frame */ - store_sigregs(); - - /* Create ucontext on the signal stack. */ - if (__put_user(uc_flags, &frame->uc.uc_flags) || - __put_user(0, &frame->uc.uc_link) || - __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || - save_sigregs32(regs, &frame->uc.uc_mcontext) || - put_compat_sigset(&frame->uc.uc_sigmask, set, sizeof(compat_sigset_t)) || - save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->gprs[14] = restorer; - regs->gprs[15] = (__force __u64) frame; - /* Force 31 bit amode and default user address space control. */ - regs->psw.mask = PSW_MASK_BA | - (PSW_USER_BITS & PSW_MASK_ASC) | - (regs->psw.mask & ~PSW_MASK_ASC); - regs->psw.addr = (__u64 __force) ksig->ka.sa.sa_handler; - - regs->gprs[2] = ksig->sig; - regs->gprs[3] = (__force __u64) &frame->info; - regs->gprs[4] = (__force __u64) &frame->uc; - regs->gprs[5] = current->thread.last_break; - return 0; -} - -/* - * OK, we're invoking a handler - */ - -void handle_signal32(struct ksignal *ksig, sigset_t *oldset, - struct pt_regs *regs) -{ - int ret; - - /* Set up the stack frame */ - if (ksig->ka.sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame32(ksig, oldset, regs); - else - ret = setup_frame32(ksig, oldset, regs); - - signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLE_STEP)); -} - diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c new file mode 100644 index 000000000000..9d85b4bc7036 --- /dev/null +++ b/arch/s390/kernel/cpacf.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define pr_fmt(fmt) "cpacf: " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/sysfs.h> +#include <asm/cpacf.h> + +#define CPACF_QUERY(name, instruction) \ +static ssize_t name##_query_raw_read(struct file *fp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_mask_t mask; \ + \ + if (!cpacf_query(CPACF_##instruction, &mask)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &mask, sizeof(mask)); \ +} \ +static const BIN_ATTR_RO(name##_query_raw, sizeof(cpacf_mask_t)) + +CPACF_QUERY(km, KM); +CPACF_QUERY(kmc, KMC); +CPACF_QUERY(kimd, KIMD); +CPACF_QUERY(klmd, KLMD); +CPACF_QUERY(kmac, KMAC); +CPACF_QUERY(pckmo, PCKMO); +CPACF_QUERY(kmf, KMF); +CPACF_QUERY(kmctr, KMCTR); +CPACF_QUERY(kmo, KMO); +CPACF_QUERY(pcc, PCC); +CPACF_QUERY(prno, PRNO); +CPACF_QUERY(kma, KMA); +CPACF_QUERY(kdsa, KDSA); + +#define CPACF_QAI(name, instruction) \ +static ssize_t name##_query_auth_info_raw_read( \ + struct file *fp, struct kobject *kobj, \ + const struct bin_attribute *attr, char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_qai_t qai; \ + \ + if (!cpacf_qai(CPACF_##instruction, &qai)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &qai, \ + sizeof(qai)); \ +} \ +static const BIN_ATTR_RO(name##_query_auth_info_raw, sizeof(cpacf_qai_t)) + +CPACF_QAI(km, KM); +CPACF_QAI(kmc, KMC); +CPACF_QAI(kimd, KIMD); +CPACF_QAI(klmd, KLMD); +CPACF_QAI(kmac, KMAC); +CPACF_QAI(pckmo, PCKMO); +CPACF_QAI(kmf, KMF); +CPACF_QAI(kmctr, KMCTR); +CPACF_QAI(kmo, KMO); +CPACF_QAI(pcc, PCC); +CPACF_QAI(prno, PRNO); +CPACF_QAI(kma, KMA); +CPACF_QAI(kdsa, KDSA); + +static const struct bin_attribute *const cpacf_attrs[] = { + &bin_attr_km_query_raw, + &bin_attr_kmc_query_raw, + &bin_attr_kimd_query_raw, + &bin_attr_klmd_query_raw, + &bin_attr_kmac_query_raw, + &bin_attr_pckmo_query_raw, + &bin_attr_kmf_query_raw, + &bin_attr_kmctr_query_raw, + &bin_attr_kmo_query_raw, + &bin_attr_pcc_query_raw, + &bin_attr_prno_query_raw, + &bin_attr_kma_query_raw, + &bin_attr_kdsa_query_raw, + &bin_attr_km_query_auth_info_raw, + &bin_attr_kmc_query_auth_info_raw, + &bin_attr_kimd_query_auth_info_raw, + &bin_attr_klmd_query_auth_info_raw, + &bin_attr_kmac_query_auth_info_raw, + &bin_attr_pckmo_query_auth_info_raw, + &bin_attr_kmf_query_auth_info_raw, + &bin_attr_kmctr_query_auth_info_raw, + &bin_attr_kmo_query_auth_info_raw, + &bin_attr_pcc_query_auth_info_raw, + &bin_attr_prno_query_auth_info_raw, + &bin_attr_kma_query_auth_info_raw, + &bin_attr_kdsa_query_auth_info_raw, + NULL, +}; + +static const struct attribute_group cpacf_attr_grp = { + .name = "cpacf", + .bin_attrs = cpacf_attrs, +}; + +static int __init cpacf_init(void) +{ + struct device *cpu_root; + int rc = 0; + + cpu_root = bus_get_dev_root(&cpu_subsys); + if (cpu_root) { + rc = sysfs_create_group(&cpu_root->kobj, &cpacf_attr_grp); + put_device(cpu_root); + } + return rc; +} +device_initcall(cpacf_init); diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 54efc279f54e..ab611764642a 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -6,8 +6,7 @@ * Christian Borntraeger (cborntra@de.ibm.com), */ -#define KMSG_COMPONENT "cpcmd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpcmd: " fmt #include <linux/kernel.h> #include <linux/export.h> @@ -16,10 +15,11 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/mm.h> +#include <linux/io.h> #include <asm/diag.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> -#include <asm/io.h> +#include <asm/asm.h> static DEFINE_SPINLOCK(cpcmd_lock); static char cpcmd_buf[241]; @@ -29,7 +29,7 @@ static int diag8_noresponse(int cmdlen) asm volatile( " diag %[rx],%[ry],0x8\n" : [ry] "+&d" (cmdlen) - : [rx] "d" ((addr_t) cpcmd_buf) + : [rx] "d" (__pa(cpcmd_buf)) : "cc"); return cmdlen; } @@ -39,18 +39,17 @@ static int diag8_response(int cmdlen, char *response, int *rlen) union register_pair rx, ry; int cc; - rx.even = (addr_t) cpcmd_buf; - rx.odd = (addr_t) response; + rx.even = __pa(cpcmd_buf); + rx.odd = __pa(response); ry.even = cmdlen | 0x40000000L; ry.odd = *rlen; asm volatile( " diag %[rx],%[ry],0x8\n" - " ipm %[cc]\n" - " srl %[cc],28\n" - : [cc] "=&d" (cc), [ry] "+&d" (ry.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [ry] "+d" (ry.pair) : [rx] "d" (rx.pair) - : "cc"); - if (cc) + : CC_CLOBBER); + if (CC_TRANSFORM(cc)) *rlen += ry.odd; else *rlen = ry.odd; diff --git a/arch/s390/kernel/cpufeature.c b/arch/s390/kernel/cpufeature.c new file mode 100644 index 000000000000..c9eef9ed876b --- /dev/null +++ b/arch/s390/kernel/cpufeature.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2022 + */ + +#include <linux/cpufeature.h> +#include <linux/export.h> +#include <linux/bug.h> +#include <asm/machine.h> +#include <asm/elf.h> + +enum { + TYPE_HWCAP, + TYPE_FACILITY, + TYPE_MACHINE, +}; + +struct s390_cpu_feature { + unsigned int type : 4; + unsigned int num : 28; +}; + +static struct s390_cpu_feature s390_cpu_features[MAX_CPU_FEATURES] = { + [S390_CPU_FEATURE_MSA] = {.type = TYPE_HWCAP, .num = HWCAP_NR_MSA}, + [S390_CPU_FEATURE_VXRS] = {.type = TYPE_HWCAP, .num = HWCAP_NR_VXRS}, + [S390_CPU_FEATURE_UV] = {.type = TYPE_FACILITY, .num = 158}, + [S390_CPU_FEATURE_D288] = {.type = TYPE_MACHINE, .num = MFEATURE_DIAG288}, +}; + +/* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + struct s390_cpu_feature *feature; + + if (WARN_ON_ONCE(num >= MAX_CPU_FEATURES)) + return 0; + feature = &s390_cpu_features[num]; + switch (feature->type) { + case TYPE_HWCAP: + return !!(elf_hwcap & BIT(feature->num)); + case TYPE_FACILITY: + return test_facility(feature->num); + case TYPE_MACHINE: + return test_machine_feature(feature->num); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL(cpu_have_feature); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 0e36dfc9ccd6..d4839de8ce9d 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -7,6 +7,7 @@ */ #include <linux/crash_dump.h> +#include <linux/export.h> #include <asm/lowcore.h> #include <linux/kernel.h> #include <linux/init.h> @@ -15,11 +16,14 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/elf.h> +#include <linux/uio.h> #include <asm/asm-offsets.h> #include <asm/os_info.h> #include <asm/elf.h> #include <asm/ipl.h> #include <asm/sclp.h> +#include <asm/maccess.h> +#include <asm/fpu.h> #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -44,7 +48,7 @@ struct save_area { u64 fprs[16]; u32 fpc; u32 prefix; - u64 todpreg; + u32 todpreg; u64 timer; u64 todcmp; u64 vxrs_low[16]; @@ -60,9 +64,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = (void *) memblock_phys_alloc(sizeof(*sa), 8); - if (!sa) - panic("Failed to allocate save area\n"); + sa = memblock_alloc_or_panic(sizeof(*sa), 8); if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); @@ -108,126 +110,65 @@ void __init save_area_add_vxrs(struct save_area *sa, __vector128 *vxrs) /* Copy lower halves of vector registers 0-15 */ for (i = 0; i < 16; i++) - memcpy(&sa->vxrs_low[i], &vxrs[i].u[2], 8); + sa->vxrs_low[i] = vxrs[i].low; /* Copy vector registers 16-31 */ memcpy(sa->vxrs_high, vxrs + 16, 16 * sizeof(__vector128)); } -/* - * Return physical address for virtual address - */ -static inline void *load_real_addr(void *addr) -{ - unsigned long real_addr; - - asm volatile( - " lra %0,0(%1)\n" - " jz 0f\n" - " la %0,0\n" - "0:" - : "=a" (real_addr) : "a" (addr) : "cc"); - return (void *)real_addr; -} - -/* - * Copy memory of the old, dumped system to a kernel space virtual address - */ -int copy_oldmem_kernel(void *dst, void *src, size_t count) +static size_t copy_oldmem_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long from, len; - void *ra; - int rc; + size_t len, copied, res = 0; while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { + if (!oldmem_data.start && src < sclp.hsa_size) { /* Copy from zfcp/nvme dump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_kernel(dst, from, len); - if (rc) - return rc; + len = min(count, sclp.hsa_size - src); + copied = memcpy_hsa_iter(iter, src, len); } else { /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; + if (oldmem_data.start && src - oldmem_data.start < oldmem_data.size) { + src -= oldmem_data.start; + len = min(count, oldmem_data.size - src); + } else if (oldmem_data.start && src < oldmem_data.size) { + len = min(count, oldmem_data.size - src); + src += oldmem_data.start; } else { len = count; } - if (is_vmalloc_or_module_addr(dst)) { - ra = load_real_addr(dst); - len = min(PAGE_SIZE - offset_in_page(ra), len); - } else { - ra = dst; - } - if (memcpy_real(ra, (void *) from, len)) - return -EFAULT; + copied = memcpy_real_iter(iter, src, len); } - dst += len; - src += len; - count -= len; + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - return 0; + return res; } -/* - * Copy memory of the old, dumped system to a user space virtual address - */ -static int copy_oldmem_user(void __user *dst, void *src, size_t count) +int copy_oldmem_kernel(void *dst, unsigned long src, size_t count) { - unsigned long from, len; - int rc; + struct iov_iter iter; + struct kvec kvec; - while (count) { - from = __pa(src); - if (!OLDMEM_BASE && from < sclp.hsa_size) { - /* Copy from zfcp/nvme dump HSA area */ - len = min(count, sclp.hsa_size - from); - rc = memcpy_hsa_user(dst, from, len); - if (rc) - return rc; - } else { - /* Check for swapped kdump oldmem areas */ - if (OLDMEM_BASE && from - OLDMEM_BASE < OLDMEM_SIZE) { - from -= OLDMEM_BASE; - len = min(count, OLDMEM_SIZE - from); - } else if (OLDMEM_BASE && from < OLDMEM_SIZE) { - len = min(count, OLDMEM_SIZE - from); - from += OLDMEM_BASE; - } else { - len = count; - } - rc = copy_to_user_real(dst, (void *) from, count); - if (rc) - return rc; - } - dst += len; - src += len; - count -= len; - } + kvec.iov_base = dst; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (copy_oldmem_iter(&iter, src, count) < count) + return -EFAULT; return 0; } /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - void *src; - int rc; + unsigned long src; - if (!csize) - return 0; - src = (void *) (pfn << PAGE_SHIFT) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); - else - rc = copy_oldmem_kernel((void *) buf, src, csize); - return rc; + src = pfn_to_phys(pfn) + offset; + return copy_oldmem_iter(iter, src, csize); } /* @@ -243,10 +184,10 @@ static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, unsigned long size_old; int rc; - if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { - size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + if (pfn < oldmem_data.size >> PAGE_SHIFT) { + size_old = min(size, oldmem_data.size - (pfn << PAGE_SHIFT)); rc = remap_pfn_range(vma, from, - pfn + (OLDMEM_BASE >> PAGE_SHIFT), + pfn + (oldmem_data.start >> PAGE_SHIFT), size_old, prot); if (rc || size == size_old) return rc; @@ -288,21 +229,23 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - if (OLDMEM_BASE) + if (oldmem_data.start) return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); else return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, prot); } -static const char *nt_name(Elf64_Word type) +/* + * Return true only when in a kdump or stand-alone kdump environment. + * Note that /proc/vmcore might also be available in "standard zfcp/nvme dump" + * environments, where this function returns false; see dump_available(). + */ +bool is_kdump_kernel(void) { - const char *name = "LINUX"; - - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG) - name = KEXEC_CORE_NOTE_NAME; - return name; + return oldmem_data.start; } +EXPORT_SYMBOL_GPL(is_kdump_kernel); /* * Initialize ELF note @@ -328,10 +271,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void *desc, int d_len, return PTR_ADD(buf, len); } -static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len) -{ - return nt_init_name(buf, type, desc, d_len, nt_name(type)); -} +#define nt_init(buf, type, desc) \ + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type) /* * Calculate the size of ELF note @@ -347,10 +288,7 @@ static size_t nt_size_name(int d_len, const char *name) return size; } -static inline size_t nt_size(Elf64_Word type, int d_len) -{ - return nt_size_name(d_len, nt_name(type)); -} +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type) /* * Fill ELF notes for one CPU with save area registers @@ -371,18 +309,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs)); /* Create ELF notes for the CPU */ - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus)); - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset)); - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer)); - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp)); - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg)); - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs)); - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { - ptr = nt_init(ptr, NT_S390_VXRS_HIGH, - &sa->vxrs_high, sizeof(sa->vxrs_high)); - ptr = nt_init(ptr, NT_S390_VXRS_LOW, - &sa->vxrs_low, sizeof(sa->vxrs_low)); + ptr = nt_init(ptr, PRSTATUS, nt_prstatus); + ptr = nt_init(ptr, PRFPREG, nt_fpregset); + ptr = nt_init(ptr, S390_TIMER, sa->timer); + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp); + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg); + ptr = nt_init(ptr, S390_CTRS, sa->ctrs); + ptr = nt_init(ptr, S390_PREFIX, sa->prefix); + if (cpu_has_vx()) { + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high); + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low); } return ptr; } @@ -395,16 +331,16 @@ static size_t get_cpu_elf_notes_size(void) struct save_area *sa = NULL; size_t size; - size = nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus)); - size += nt_size(NT_PRFPREG, sizeof(elf_fpregset_t)); - size += nt_size(NT_S390_TIMER, sizeof(sa->timer)); - size += nt_size(NT_S390_TODCMP, sizeof(sa->todcmp)); - size += nt_size(NT_S390_TODPREG, sizeof(sa->todpreg)); - size += nt_size(NT_S390_CTRS, sizeof(sa->ctrs)); - size += nt_size(NT_S390_PREFIX, sizeof(sa->prefix)); - if (MACHINE_HAS_VX) { - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high)); - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low)); + size = nt_size(PRSTATUS, struct elf_prstatus); + size += nt_size(PRFPREG, elf_fpregset_t); + size += nt_size(S390_TIMER, sa->timer); + size += nt_size(S390_TODCMP, sa->todcmp); + size += nt_size(S390_TODPREG, sa->todpreg); + size += nt_size(S390_CTRS, sa->ctrs); + size += nt_size(S390_PREFIX, sa->prefix); + if (cpu_has_vx()) { + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high); + size += nt_size(S390_VXRS_LOW, sa->vxrs_low); } return size; @@ -419,8 +355,8 @@ static void *nt_prpsinfo(void *ptr) memset(&prpsinfo, 0, sizeof(prpsinfo)); prpsinfo.pr_sname = 'R'; - strcpy(prpsinfo.pr_fname, "vmlinux"); - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); + strscpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, PRPSINFO, prpsinfo); } /* @@ -429,10 +365,10 @@ static void *nt_prpsinfo(void *ptr) static void *get_vmcoreinfo_old(unsigned long *size) { char nt_name[11], *vmcoreinfo; + unsigned long addr; Elf64_Nhdr note; - void *addr; - if (copy_oldmem_kernel(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_VMCORE_INFO, sizeof(addr))) return NULL; memset(nt_name, 0, sizeof(nt_name)); if (copy_oldmem_kernel(¬e, addr, sizeof(note))) @@ -509,7 +445,7 @@ static void *nt_final(void *ptr) /* * Initialize ELF header (new kernel) */ -static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +static void *ehdr_init(Elf64_Ehdr *ehdr, int phdr_count) { memset(ehdr, 0, sizeof(*ehdr)); memcpy(ehdr->e_ident, ELFMAG, SELFMAG); @@ -523,7 +459,8 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) ehdr->e_phoff = sizeof(Elf64_Ehdr); ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - ehdr->e_phnum = mem_chunk_cnt + 1; + /* Number of PT_LOAD program headers plus PT_NOTE program header */ + ehdr->e_phnum = phdr_count + 1; return ehdr + 1; } @@ -554,27 +491,77 @@ static int get_mem_chunk_cnt(void) return cnt; } +static void fill_ptload(Elf64_Phdr *phdr, unsigned long paddr, + unsigned long vaddr, unsigned long size) +{ + phdr->p_type = PT_LOAD; + phdr->p_vaddr = vaddr; + phdr->p_offset = paddr; + phdr->p_paddr = paddr; + phdr->p_filesz = size; + phdr->p_memsz = size; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize ELF loads (new kernel) */ -static void loads_init(Elf64_Phdr *phdr, u64 loads_offset) +static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm) { + unsigned long old_identity_base = 0; phys_addr_t start, end; u64 idx; + if (os_info_has_vm) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); for_each_physmem_range(idx, &oldmem_type, &start, &end) { - phdr->p_filesz = end - start; - phdr->p_type = PT_LOAD; - phdr->p_offset = start; - phdr->p_vaddr = start; - phdr->p_paddr = start; - phdr->p_memsz = end - start; - phdr->p_flags = PF_R | PF_W | PF_X; - phdr->p_align = PAGE_SIZE; + fill_ptload(phdr, start, old_identity_base + start, + end - start); phdr++; } } +static bool os_info_has_vm(void) +{ + return os_info_old_value(OS_INFO_KASLR_OFFSET); +} + +#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM +/* + * Fill PT_LOAD for a physical memory range owned by a device and detected by + * its device driver. + */ +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size) +{ + unsigned long old_identity_base = 0; + + if (os_info_has_vm()) + old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); + fill_ptload(phdr, paddr, old_identity_base + paddr, size); +} +#endif + +/* + * Prepare PT_LOAD type program header for kernel image region + */ +static void text_init(Elf64_Phdr *phdr) +{ + unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS); + unsigned long start = os_info_old_value(OS_INFO_IMAGE_START); + unsigned long end = os_info_old_value(OS_INFO_IMAGE_END); + + phdr->p_type = PT_LOAD; + phdr->p_vaddr = start; + phdr->p_filesz = end - start; + phdr->p_memsz = end - start; + phdr->p_offset = start_phys; + phdr->p_paddr = start_phys; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize notes (new kernel) */ @@ -600,7 +587,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) return ptr; } -static size_t get_elfcorehdr_size(int mem_chunk_cnt) +static size_t get_elfcorehdr_size(int phdr_count) { size_t size; @@ -608,7 +595,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) /* PT_NOTES */ size += sizeof(Elf64_Phdr); /* nt_prpsinfo */ - size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo)); + size += nt_size(PRPSINFO, struct elf_prpsinfo); /* regsets */ size += get_cpu_cnt() * get_cpu_elf_notes_size(); /* nt_vmcoreinfo */ @@ -616,7 +603,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) /* nt_final */ size += sizeof(Elf64_Nhdr); /* PT_LOADS */ - size += mem_chunk_cnt * sizeof(Elf64_Phdr); + size += phdr_count * sizeof(Elf64_Phdr); return size; } @@ -626,33 +613,35 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) */ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { - Elf64_Phdr *phdr_notes, *phdr_loads; - int mem_chunk_cnt; + Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text; + int mem_chunk_cnt, phdr_text_cnt; + size_t alloc_size; void *ptr, *hdr; - u32 alloc_size; u64 hdr_off; /* If we are not in kdump or zfcp/nvme dump mode return */ - if (!OLDMEM_BASE && !is_ipl_type_dump()) + if (!oldmem_data.start && !is_ipl_type_dump()) return 0; /* If we cannot get HSA size for zfcp/nvme dump return error */ if (is_ipl_type_dump() && !sclp.hsa_size) return -ENODEV; /* For kdump, exclude previous crashkernel memory */ - if (OLDMEM_BASE) { - oldmem_region.base = OLDMEM_BASE; - oldmem_region.size = OLDMEM_SIZE; - oldmem_type.total_size = OLDMEM_SIZE; + if (oldmem_data.start) { + oldmem_region.base = oldmem_data.start; + oldmem_region.size = oldmem_data.size; + oldmem_type.total_size = oldmem_data.size; } mem_chunk_cnt = get_mem_chunk_cnt(); + phdr_text_cnt = os_info_has_vm() ? 1 : 0; - alloc_size = get_elfcorehdr_size(mem_chunk_cnt); + alloc_size = get_elfcorehdr_size(mem_chunk_cnt + phdr_text_cnt); hdr = kzalloc(alloc_size, GFP_KERNEL); - /* Without elfcorehdr /proc/vmcore cannot be created. Thus creating + /* + * Without elfcorehdr /proc/vmcore cannot be created. Thus creating * a dump with this crash kernel will fail. Panic now to allow other * dump mechanisms to take over. */ @@ -660,18 +649,25 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) panic("s390 kdump allocating elfcorehdr failed"); /* Init elf header */ - ptr = ehdr_init(hdr, mem_chunk_cnt); + phdr_notes = ehdr_init(hdr, mem_chunk_cnt + phdr_text_cnt); /* Init program headers */ - phdr_notes = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); - phdr_loads = ptr; - ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + if (phdr_text_cnt) { + phdr_text = phdr_notes + 1; + phdr_loads = phdr_text + 1; + } else { + phdr_loads = phdr_notes + 1; + } + ptr = PTR_ADD(phdr_loads, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init kernel text program header */ + if (phdr_text_cnt) + text_init(phdr_text); /* Init loads */ + loads_init(phdr_loads, phdr_text_cnt); + /* Finalize program headers */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, hdr_off); *addr = (unsigned long long) hdr; *size = (unsigned long long) hdr_off; BUG_ON(elfcorehdr_size > alloc_size); diff --git a/arch/s390/kernel/ctlreg.c b/arch/s390/kernel/ctlreg.c new file mode 100644 index 000000000000..a0501f4c7e7a --- /dev/null +++ b/arch/s390/kernel/ctlreg.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/irqflags.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cache.h> +#include <asm/abs_lowcore.h> +#include <asm/ctlreg.h> + +/* + * ctl_lock guards access to global control register contents which + * are kept in the control register save area within absolute lowcore + * at physical address zero. + */ +static DEFINE_SPINLOCK(system_ctl_lock); + +void system_ctlreg_lock(void) + __acquires(&system_ctl_lock) +{ + spin_lock(&system_ctl_lock); +} + +void system_ctlreg_unlock(void) + __releases(&system_ctl_lock) +{ + spin_unlock(&system_ctl_lock); +} + +static bool system_ctlreg_area_init __ro_after_init; + +void __init system_ctlreg_init_save_area(struct lowcore *lc) +{ + struct lowcore *abs_lc; + + abs_lc = get_abs_lowcore(); + __local_ctl_store(0, 15, lc->cregs_save_area); + __local_ctl_store(0, 15, abs_lc->cregs_save_area); + put_abs_lowcore(abs_lc); + system_ctlreg_area_init = true; +} + +struct ctlreg_parms { + unsigned long andval; + unsigned long orval; + unsigned long val; + int request; + int cr; +}; + +static void ctlreg_callback(void *info) +{ + struct ctlreg_parms *pp = info; + struct ctlreg regs[16]; + + __local_ctl_store(0, 15, regs); + if (pp->request == CTLREG_LOAD) { + regs[pp->cr].val = pp->val; + } else { + regs[pp->cr].val &= pp->andval; + regs[pp->cr].val |= pp->orval; + } + __local_ctl_load(0, 15, regs); +} + +static void system_ctlreg_update(void *info) +{ + unsigned long flags; + + if (system_state == SYSTEM_BOOTING) { + /* + * For very early calls do not call on_each_cpu() + * since not everything might be setup. + */ + local_irq_save(flags); + ctlreg_callback(info); + local_irq_restore(flags); + } else { + on_each_cpu(ctlreg_callback, info, 1); + } +} + +void system_ctlreg_modify(unsigned int cr, unsigned long data, int request) +{ + struct ctlreg_parms pp = { .cr = cr, .request = request, }; + struct lowcore *abs_lc; + + switch (request) { + case CTLREG_SET_BIT: + pp.orval = 1UL << data; + pp.andval = -1UL; + break; + case CTLREG_CLEAR_BIT: + pp.orval = 0; + pp.andval = ~(1UL << data); + break; + case CTLREG_LOAD: + pp.val = data; + break; + } + if (system_ctlreg_area_init) { + system_ctlreg_lock(); + abs_lc = get_abs_lowcore(); + if (request == CTLREG_LOAD) { + abs_lc->cregs_save_area[cr].val = pp.val; + } else { + abs_lc->cregs_save_area[cr].val &= pp.andval; + abs_lc->cregs_save_area[cr].val |= pp.orval; + } + put_abs_lowcore(abs_lc); + system_ctlreg_update(&pp); + system_ctlreg_unlock(); + } else { + system_ctlreg_update(&pp); + } +} +EXPORT_SYMBOL(system_ctlreg_modify); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 09b6c6402f9b..71cdb6845dd7 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -10,8 +10,7 @@ * Bugreports to: <Linux390@de.ibm.com> */ -#define KMSG_COMPONENT "s390dbf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "s390dbf: " fmt #include <linux/stddef.h> #include <linux/kernel.h> @@ -24,6 +23,8 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/math.h> +#include <linux/minmax.h> #include <linux/debugfs.h> #include <asm/debug.h> @@ -37,13 +38,13 @@ typedef struct file_private_info { loff_t offset; /* offset of last read in file */ - int act_area; /* number of last formated area */ + int act_area; /* number of last formatted area */ int act_page; /* act page in given area */ - int act_entry; /* last formated entry (offset */ + int act_entry; /* last formatted entry (offset */ /* relative to beginning of last */ - /* formated page) */ + /* formatted page) */ size_t act_entry_offset; /* up to this offset we copied */ - /* in last read the last formated */ + /* in last read the last formatted */ /* entry to userland */ char temp_buf[2048]; /* buffer for output */ debug_info_t *debug_info_org; /* original debug information */ @@ -59,10 +60,10 @@ typedef struct { * except of floats, and long long (32 bit) * */ - long args[0]; + long args[]; } debug_sprintf_entry_t; -/* internal function prototyes */ +/* internal function prototypes */ static int debug_init(void); static ssize_t debug_output(struct file *file, char __user *user_buf, @@ -76,12 +77,14 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, static void debug_info_get(debug_info_t *); static void debug_info_put(debug_info_t *); static int debug_prolog_level_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_level_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_prolog_pages_fn(debug_info_t *id, - struct debug_view *view, char *out_buf); + struct debug_view *view, char *out_buf, + size_t out_buf_size); static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); @@ -89,9 +92,10 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view, struct file *file, const char __user *user_buf, size_t user_buf_size, loff_t *offset); static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf); -static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event); + char *out_buf, size_t out_buf_size, + const char *in_buf); +static void debug_areas_swap(debug_info_t *a, debug_info_t *b); +static void debug_events_append(debug_info_t *dest, debug_info_t *src); /* globals */ @@ -136,7 +140,7 @@ struct debug_view debug_sprintf_view = { "sprintf", NULL, &debug_dflt_header_fn, - (debug_format_proc_t *)&debug_sprintf_format_fn, + &debug_sprintf_format_fn, NULL, NULL }; @@ -160,7 +164,6 @@ static const struct file_operations debug_file_ops = { .write = debug_input, .open = debug_open, .release = debug_close, - .llseek = no_llseek, }; static struct dentry *debug_debugfs_root_entry; @@ -247,7 +250,7 @@ static debug_info_t *debug_info_alloc(const char *name, int pages_per_area, rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strlcpy(rc->name, name, sizeof(rc->name)); + strscpy(rc->name, name); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0, DEBUG_MAX_VIEWS * sizeof(struct dentry *)); refcount_set(&(rc->ref_count), 0); @@ -311,24 +314,6 @@ static debug_info_t *debug_info_create(const char *name, int pages_per_area, goto out; rc->mode = mode & ~S_IFMT; - - /* create root directory */ - rc->debugfs_root_entry = debugfs_create_dir(rc->name, - debug_debugfs_root_entry); - - /* append new element to linked list */ - if (!debug_area_first) { - /* first element in list */ - debug_area_first = rc; - rc->prev = NULL; - } else { - /* append element to end of list */ - debug_area_last->next = rc; - rc->prev = debug_area_last; - } - debug_area_last = rc; - rc->next = NULL; - refcount_set(&rc->ref_count, 1); out: return rc; @@ -366,7 +351,10 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode) for (i = 0; i < in->nr_areas; i++) { for (j = 0; j < in->pages_per_area; j++) memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE); + rc->active_pages[i] = in->active_pages[i]; + rc->active_entries[i] = in->active_entries[i]; } + rc->active_area = in->active_area; out: spin_unlock_irqrestore(&in->lock, flags); return rc; @@ -388,32 +376,15 @@ static void debug_info_get(debug_info_t *db_info) */ static void debug_info_put(debug_info_t *db_info) { - int i; - if (!db_info) return; - if (refcount_dec_and_test(&db_info->ref_count)) { - for (i = 0; i < DEBUG_MAX_VIEWS; i++) { - if (!db_info->views[i]) - continue; - debugfs_remove(db_info->debugfs_entries[i]); - } - debugfs_remove(db_info->debugfs_root_entry); - if (db_info == debug_area_first) - debug_area_first = db_info->next; - if (db_info == debug_area_last) - debug_area_last = db_info->prev; - if (db_info->prev) - db_info->prev->next = db_info->next; - if (db_info->next) - db_info->next->prev = db_info->prev; + if (refcount_dec_and_test(&db_info->ref_count)) debug_info_free(db_info); - } } /* * debug_format_entry: - * - format one debug entry and return size of formated data + * - format one debug entry and return size of formatted data */ static int debug_format_entry(file_private_info_t *p_info) { @@ -424,8 +395,10 @@ static int debug_format_entry(file_private_info_t *p_info) if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { /* print prolog */ - if (view->prolog_proc) - len += view->prolog_proc(id_snap, view, p_info->temp_buf); + if (view->prolog_proc) { + len += view->prolog_proc(id_snap, view, p_info->temp_buf, + sizeof(p_info->temp_buf)); + } goto out; } if (!id_snap->areas) /* this is true, if we have a prolog only view */ @@ -435,21 +408,31 @@ static int debug_format_entry(file_private_info_t *p_info) if (act_entry->clock == 0LL) goto out; /* empty entry */ - if (view->header_proc) + if (view->header_proc) { len += view->header_proc(id_snap, view, p_info->act_area, - act_entry, p_info->temp_buf + len); - if (view->format_proc) + act_entry, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len); + } + if (view->format_proc) { len += view->format_proc(id_snap, view, p_info->temp_buf + len, + sizeof(p_info->temp_buf) - len, DEBUG_DATA(act_entry)); + } out: return len; } -/* - * debug_next_entry: - * - goto next entry in p_info +/** + * debug_next_entry - Go to the next entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the next entry. If no further entry + * exists the current position is set to one after the end the return value + * indicates that no further entries exist. + * + * Return: True if there are more following entries, false otherwise */ -static inline int debug_next_entry(file_private_info_t *p_info) +static inline bool debug_next_entry(file_private_info_t *p_info) { debug_info_t *id; @@ -457,10 +440,10 @@ static inline int debug_next_entry(file_private_info_t *p_info) if (p_info->act_entry == DEBUG_PROLOG_ENTRY) { p_info->act_entry = 0; p_info->act_page = 0; - goto out; + return true; } if (!id->areas) - return 1; + return false; p_info->act_entry += id->entry_size; /* switch to next page, if we reached the end of the page */ if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) { @@ -473,16 +456,93 @@ static inline int debug_next_entry(file_private_info_t *p_info) p_info->act_page = 0; } if (p_info->act_area >= id->nr_areas) - return 1; + return false; } -out: - return 0; + return true; +} + +/** + * debug_to_act_entry - Go to the currently active entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the currently active + * entry of @p_info->debug_info_snap + */ +static void debug_to_act_entry(file_private_info_t *p_info) +{ + debug_info_t *snap_id; + + snap_id = p_info->debug_info_snap; + p_info->act_area = snap_id->active_area; + p_info->act_page = snap_id->active_pages[snap_id->active_area]; + p_info->act_entry = snap_id->active_entries[snap_id->active_area]; +} + +/** + * debug_prev_entry - Go to the previous entry + * @p_info: Private info that is manipulated + * + * Sets the current position in @p_info to the previous entry. If no previous entry + * exists the current position is set left as DEBUG_PROLOG_ENTRY and the return value + * indicates that no previous entries exist. + * + * Return: True if there are more previous entries, false otherwise + */ + +static inline bool debug_prev_entry(file_private_info_t *p_info) +{ + debug_info_t *id; + + id = p_info->debug_info_snap; + if (p_info->act_entry == DEBUG_PROLOG_ENTRY) + debug_to_act_entry(p_info); + if (!id->areas) + return false; + p_info->act_entry -= id->entry_size; + /* switch to prev page, if we reached the beginning of the page */ + if (p_info->act_entry < 0) { + /* end of previous page */ + p_info->act_entry = rounddown(PAGE_SIZE, id->entry_size) - id->entry_size; + p_info->act_page--; + if (p_info->act_page < 0) { + /* previous area */ + p_info->act_area--; + p_info->act_page = id->pages_per_area - 1; + } + if (p_info->act_area < 0) + p_info->act_area = (id->nr_areas - 1) % id->nr_areas; + } + /* check full circle */ + if (id->active_area == p_info->act_area && + id->active_pages[id->active_area] == p_info->act_page && + id->active_entries[id->active_area] == p_info->act_entry) + return false; + return true; +} + +/** + * debug_move_entry - Go to next entry in either the forward or backward direction + * @p_info: Private info that is manipulated + * @reverse: If true go to the next entry in reverse i.e. previous + * + * Sets the current position in @p_info to the next (@reverse == false) or + * previous (@reverse == true) entry. + * + * Return: True if there are further entries in that direction, + * false otherwise. + */ +static bool debug_move_entry(file_private_info_t *p_info, bool reverse) +{ + if (reverse) + return debug_prev_entry(p_info); + else + return debug_next_entry(p_info); } /* * debug_output: * - called for user read() - * - copies formated debug entries to the user buffer + * - copies formatted debug entries to the user buffer */ static ssize_t debug_output(struct file *file, /* file descriptor */ char __user *user_buf, /* user buffer */ @@ -518,7 +578,7 @@ static ssize_t debug_output(struct file *file, /* file descriptor */ } if (copy_size == formatted_line_residue) { entry_offset = 0; - if (debug_next_entry(p_info)) + if (!debug_next_entry(p_info)) goto out; } } @@ -553,15 +613,51 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf, return rc; /* number of input characters */ } +static file_private_info_t *debug_file_private_alloc(debug_info_t *debug_info, + struct debug_view *view) +{ + debug_info_t *debug_info_snapshot; + file_private_info_t *p_info; + + /* + * Make snapshot of current debug areas to get it consistent. + * To copy all the areas is only needed, if we have a view which + * formats the debug areas. + */ + if (!view->format_proc && !view->header_proc) + debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); + else + debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); + + if (!debug_info_snapshot) + return NULL; + p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + if (!p_info) { + debug_info_free(debug_info_snapshot); + return NULL; + } + p_info->offset = 0; + p_info->debug_info_snap = debug_info_snapshot; + p_info->debug_info_org = debug_info; + p_info->view = view; + p_info->act_area = 0; + p_info->act_page = 0; + p_info->act_entry = DEBUG_PROLOG_ENTRY; + p_info->act_entry_offset = 0; + debug_info_get(debug_info); + + return p_info; +} + /* * debug_open: * - called for user open() - * - copies formated output to private_data area of the file + * - copies formatted output to private_data area of the file * handle */ static int debug_open(struct inode *inode, struct file *file) { - debug_info_t *debug_info, *debug_info_snapshot; + debug_info_t *debug_info; file_private_info_t *p_info; int i, rc = 0; @@ -579,42 +675,26 @@ static int debug_open(struct inode *inode, struct file *file) goto out; found: - - /* Make snapshot of current debug areas to get it consistent. */ - /* To copy all the areas is only needed, if we have a view which */ - /* formats the debug areas. */ - - if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc) - debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS); - else - debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS); - - if (!debug_info_snapshot) { - rc = -ENOMEM; - goto out; - } - p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL); + p_info = debug_file_private_alloc(debug_info, debug_info->views[i]); if (!p_info) { - debug_info_free(debug_info_snapshot); rc = -ENOMEM; goto out; } - p_info->offset = 0; - p_info->debug_info_snap = debug_info_snapshot; - p_info->debug_info_org = debug_info; - p_info->view = debug_info->views[i]; - p_info->act_area = 0; - p_info->act_page = 0; - p_info->act_entry = DEBUG_PROLOG_ENTRY; - p_info->act_entry_offset = 0; file->private_data = p_info; - debug_info_get(debug_info); nonseekable_open(inode, file); out: mutex_unlock(&debug_mutex); return rc; } +static void debug_file_private_free(file_private_info_t *p_info) +{ + if (p_info->debug_info_snap) + debug_info_free(p_info->debug_info_snap); + debug_info_put(p_info->debug_info_org); + kfree(p_info); +} + /* * debug_close: * - called for user close() @@ -625,14 +705,85 @@ static int debug_close(struct inode *inode, struct file *file) file_private_info_t *p_info; p_info = (file_private_info_t *) file->private_data; - if (p_info->debug_info_snap) - debug_info_free(p_info->debug_info_snap); - debug_info_put(p_info->debug_info_org); - kfree(file->private_data); + debug_file_private_free(p_info); + file->private_data = NULL; return 0; /* success */ } /** + * debug_dump - Get a textual representation of debug info, or as much as fits + * @id: Debug information to use + * @view: View with which to dump the debug information + * @buf: Buffer the textual debug data representation is written to + * @buf_size: Size of the buffer, including the trailing '\0' byte + * @reverse: Go backwards from the last written entry + * + * This function may be used whenever a textual representation of the debug + * information is required without using an s390dbf file. + * + * Note: It is the callers responsibility to supply a view that is compatible + * with the debug information data. + * + * Return: On success returns the number of bytes written to the buffer not + * including the trailing '\0' byte. If bug_size == 0 the function returns 0. + * On failure an error code less than 0 is returned. + */ +ssize_t debug_dump(debug_info_t *id, struct debug_view *view, + char *buf, size_t buf_size, bool reverse) +{ + file_private_info_t *p_info; + size_t size, offset = 0; + + /* Need space for '\0' byte */ + if (buf_size < 1) + return 0; + buf_size--; + + p_info = debug_file_private_alloc(id, view); + if (!p_info) + return -ENOMEM; + + /* There is always at least the DEBUG_PROLOG_ENTRY */ + do { + size = debug_format_entry(p_info); + size = min(size, buf_size - offset); + memcpy(buf + offset, p_info->temp_buf, size); + offset += size; + if (offset >= buf_size) + break; + } while (debug_move_entry(p_info, reverse)); + debug_file_private_free(p_info); + buf[offset] = '\0'; + + return offset; +} + +/* Create debugfs entries and add to internal list. */ +static void _debug_register(debug_info_t *id) +{ + /* create root directory */ + id->debugfs_root_entry = debugfs_create_dir(id->name, + debug_debugfs_root_entry); + + /* append new element to linked list */ + if (!debug_area_first) { + /* first element in list */ + debug_area_first = id; + id->prev = NULL; + } else { + /* append element to end of list */ + debug_area_last->next = id; + id->prev = debug_area_last; + } + debug_area_last = id; + id->next = NULL; + + debug_register_view(id, &debug_level_view); + debug_register_view(id, &debug_flush_view); + debug_register_view(id, &debug_pages_view); +} + +/** * debug_register_mode() - creates and initializes debug area. * * @name: Name of debug log (e.g. used for debugfs entry) @@ -661,19 +812,16 @@ debug_info_t *debug_register_mode(const char *name, int pages_per_area, if ((uid != 0) || (gid != 0)) pr_warn("Root becomes the owner of all s390dbf files in sysfs\n"); BUG_ON(!initialized); - mutex_lock(&debug_mutex); /* create new debug_info */ rc = debug_info_create(name, pages_per_area, nr_areas, buf_size, mode); - if (!rc) - goto out; - debug_register_view(rc, &debug_level_view); - debug_register_view(rc, &debug_flush_view); - debug_register_view(rc, &debug_pages_view); -out: - if (!rc) + if (rc) { + mutex_lock(&debug_mutex); + _debug_register(rc); + mutex_unlock(&debug_mutex); + } else { pr_err("Registering debug feature %s failed\n", name); - mutex_unlock(&debug_mutex); + } return rc; } EXPORT_SYMBOL(debug_register_mode); @@ -703,6 +851,82 @@ debug_info_t *debug_register(const char *name, int pages_per_area, EXPORT_SYMBOL(debug_register); /** + * debug_register_static() - registers a static debug area + * + * @id: Handle for static debug area + * @pages_per_area: Number of pages per area + * @nr_areas: Number of debug areas + * + * Register debug_info_t defined using DEFINE_STATIC_DEBUG_INFO. + * + * Note: This function is called automatically via an initcall generated by + * DEFINE_STATIC_DEBUG_INFO. + */ +void debug_register_static(debug_info_t *id, int pages_per_area, int nr_areas) +{ + unsigned long flags; + debug_info_t *copy; + + if (!initialized) { + pr_err("Tried to register debug feature %s too early\n", + id->name); + return; + } + + copy = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!copy) { + pr_err("Registering debug feature %s failed\n", id->name); + + /* Clear pointers to prevent tracing into released initdata. */ + spin_lock_irqsave(&id->lock, flags); + id->areas = NULL; + id->active_pages = NULL; + id->active_entries = NULL; + spin_unlock_irqrestore(&id->lock, flags); + + return; + } + + /* Replace static trace area with dynamic copy. */ + spin_lock_irqsave(&id->lock, flags); + debug_events_append(copy, id); + debug_areas_swap(id, copy); + spin_unlock_irqrestore(&id->lock, flags); + + /* Clear pointers to initdata and discard copy. */ + copy->areas = NULL; + copy->active_pages = NULL; + copy->active_entries = NULL; + debug_info_free(copy); + + mutex_lock(&debug_mutex); + _debug_register(id); + mutex_unlock(&debug_mutex); +} + +/* Remove debugfs entries and remove from internal list. */ +static void _debug_unregister(debug_info_t *id) +{ + int i; + + for (i = 0; i < DEBUG_MAX_VIEWS; i++) { + if (!id->views[i]) + continue; + debugfs_remove(id->debugfs_entries[i]); + } + debugfs_remove(id->debugfs_root_entry); + if (id == debug_area_first) + debug_area_first = id->next; + if (id == debug_area_last) + debug_area_last = id->prev; + if (id->prev) + id->prev->next = id->next; + if (id->next) + id->next->prev = id->prev; +} + +/** * debug_unregister() - give back debug area. * * @id: handle for debug log @@ -715,8 +939,10 @@ void debug_unregister(debug_info_t *id) if (!id) return; mutex_lock(&debug_mutex); - debug_info_put(id); + _debug_unregister(id); mutex_unlock(&debug_mutex); + + debug_info_put(id); } EXPORT_SYMBOL(debug_unregister); @@ -726,35 +952,28 @@ EXPORT_SYMBOL(debug_unregister); */ static int debug_set_size(debug_info_t *id, int nr_areas, int pages_per_area) { - debug_entry_t ***new_areas; + debug_info_t *new_id; unsigned long flags; - int rc = 0; if (!id || (nr_areas <= 0) || (pages_per_area < 0)) return -EINVAL; - if (pages_per_area > 0) { - new_areas = debug_areas_alloc(pages_per_area, nr_areas); - if (!new_areas) { - pr_info("Allocating memory for %i pages failed\n", - pages_per_area); - rc = -ENOMEM; - goto out; - } - } else { - new_areas = NULL; + + new_id = debug_info_alloc("", pages_per_area, nr_areas, id->buf_size, + id->level, ALL_AREAS); + if (!new_id) { + pr_info("Allocating memory for %i pages failed\n", + pages_per_area); + return -ENOMEM; } + spin_lock_irqsave(&id->lock, flags); - debug_areas_free(id); - id->areas = new_areas; - id->nr_areas = nr_areas; - id->pages_per_area = pages_per_area; - id->active_area = 0; - memset(id->active_entries, 0, sizeof(int)*id->nr_areas); - memset(id->active_pages, 0, sizeof(int)*id->nr_areas); + debug_events_append(new_id, id); + debug_areas_swap(new_id, id); + debug_info_free(new_id); spin_unlock_irqrestore(&id->lock, flags); pr_info("%s: set new size (%i pages)\n", id->name, pages_per_area); -out: - return rc; + + return 0; } /** @@ -772,16 +991,17 @@ void debug_set_level(debug_info_t *id, int new_level) if (!id) return; - spin_lock_irqsave(&id->lock, flags); + if (new_level == DEBUG_OFF_LEVEL) { - id->level = DEBUG_OFF_LEVEL; pr_info("%s: switched off\n", id->name); } else if ((new_level > DEBUG_MAX_LEVEL) || (new_level < 0)) { pr_info("%s: level %i is out of range (%i - %i)\n", id->name, new_level, 0, DEBUG_MAX_LEVEL); - } else { - id->level = new_level; + return; } + + spin_lock_irqsave(&id->lock, flags); + id->level = new_level; spin_unlock_irqrestore(&id->lock, flags); } EXPORT_SYMBOL(debug_set_level); @@ -821,6 +1041,42 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id) id->active_entries[id->active_area]); } +/* Swap debug areas of a and b. */ +static void debug_areas_swap(debug_info_t *a, debug_info_t *b) +{ + swap(a->nr_areas, b->nr_areas); + swap(a->pages_per_area, b->pages_per_area); + swap(a->areas, b->areas); + swap(a->active_area, b->active_area); + swap(a->active_pages, b->active_pages); + swap(a->active_entries, b->active_entries); +} + +/* Append all debug events in active area from source to destination log. */ +static void debug_events_append(debug_info_t *dest, debug_info_t *src) +{ + debug_entry_t *from, *to, *last; + + if (!src->areas || !dest->areas) + return; + + /* Loop over all entries in src, starting with oldest. */ + from = get_active_entry(src); + last = from; + do { + if (from->clock != 0LL) { + to = get_active_entry(dest); + memset(to, 0, dest->entry_size); + memcpy(to, from, min(src->entry_size, + dest->entry_size)); + proceed_active_entry(dest); + } + + proceed_active_entry(src); + from = get_active_entry(src); + } while (from != last); +} + /* * debug_finish_entry: * - set timestamp, caller address, cpu number etc. @@ -856,7 +1112,7 @@ static int debug_active = 1; * always allow read, allow write only if debug_stoppable is set or * if debug_active is already off */ -static int s390dbf_procactive(struct ctl_table *table, int write, +static int s390dbf_procactive(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) @@ -865,7 +1121,7 @@ static int s390dbf_procactive(struct ctl_table *table, int write, return 0; } -static struct ctl_table s390dbf_table[] = { +static const struct ctl_table s390dbf_table[] = { { .procname = "debug_stoppable", .data = &debug_stoppable, @@ -880,17 +1136,6 @@ static struct ctl_table s390dbf_table[] = { .mode = S_IRUGO | S_IWUSR, .proc_handler = s390dbf_procactive, }, - { } -}; - -static struct ctl_table s390dbf_dir_table[] = { - { - .procname = "s390dbf", - .maxlen = 0, - .mode = S_IRUGO | S_IXUGO, - .child = s390dbf_table, - }, - { } }; static struct ctl_table_header *s390dbf_sysctl_header; @@ -1111,16 +1356,17 @@ int debug_register_view(debug_info_t *id, struct debug_view *view) break; } if (i == DEBUG_MAX_VIEWS) { - pr_err("Registering view %s/%s would exceed the maximum " - "number of views %i\n", id->name, view->name, i); rc = -1; } else { id->views[i] = view; id->debugfs_entries[i] = pde; } spin_unlock_irqrestore(&id->lock, flags); - if (rc) + if (rc) { + pr_err("Registering view %s/%s would exceed the maximum " + "number of views %i\n", id->name, view->name, i); debugfs_remove(pde); + } out: return rc; } @@ -1169,18 +1415,12 @@ static inline char *debug_get_user_string(const char __user *user_buf, { char *buffer; - buffer = kmalloc(user_len + 1, GFP_KERNEL); - if (!buffer) - return ERR_PTR(-ENOMEM); - if (copy_from_user(buffer, user_buf, user_len) != 0) { - kfree(buffer); - return ERR_PTR(-EFAULT); - } + buffer = memdup_user_nul(user_buf, user_len); + if (IS_ERR(buffer)) + return buffer; /* got the string, now strip linefeed. */ if (buffer[user_len - 1] == '\n') buffer[user_len - 1] = 0; - else - buffer[user_len] = 0; return buffer; } @@ -1205,9 +1445,9 @@ static inline int debug_get_uint(char *buf) */ static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { - return sprintf(out_buf, "%i\n", id->pages_per_area); + return scnprintf(out_buf, out_buf_size, "%i\n", id->pages_per_area); } /* @@ -1254,14 +1494,14 @@ out: * prints out actual debug level */ static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view, - char *out_buf) + char *out_buf, size_t out_buf_size) { int rc = 0; if (id->level == DEBUG_OFF_LEVEL) - rc = sprintf(out_buf, "-\n"); + rc = scnprintf(out_buf, out_buf_size, "-\n"); else - rc = sprintf(out_buf, "%i\n", id->level); + rc = scnprintf(out_buf, out_buf_size, "%i\n", id->level); return rc; } @@ -1378,22 +1618,24 @@ out: * prints debug data in hex/ascii format */ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf) + char *out_buf, size_t out_buf_size, const char *in_buf) { int i, rc = 0; - for (i = 0; i < id->buf_size; i++) - rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]); - rc += sprintf(out_buf + rc, "| "); + for (i = 0; i < id->buf_size; i++) { + rc += scnprintf(out_buf + rc, out_buf_size - rc, + "%02x ", ((unsigned char *)in_buf)[i]); + } + rc += scnprintf(out_buf + rc, out_buf_size - rc, "| "); for (i = 0; i < id->buf_size; i++) { unsigned char c = in_buf[i]; if (isascii(c) && isprint(c)) - rc += sprintf(out_buf + rc, "%c", c); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "%c", c); else - rc += sprintf(out_buf + rc, "."); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "."); } - rc += sprintf(out_buf + rc, "\n"); + rc += scnprintf(out_buf + rc, out_buf_size - rc, "\n"); return rc; } @@ -1401,7 +1643,8 @@ static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view, * prints header for debug entry */ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, - int area, debug_entry_t *entry, char *out_buf) + int area, debug_entry_t *entry, char *out_buf, + size_t out_buf_size) { unsigned long sec, usec; unsigned long caller; @@ -1418,23 +1661,24 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view, else except_str = "-"; caller = (unsigned long) entry->caller; - rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ", - area, sec, usec, level, except_str, - entry->cpu, (void *)caller); + rc += scnprintf(out_buf, out_buf_size, "%02i %011ld:%06lu %1u %1s %04u %px ", + area, sec, usec, level, except_str, + entry->cpu, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); /* - * prints debug data sprintf-formated: - * debug_sprinf_event/exception calls must be used together with this view + * prints debug data sprintf-formatted: + * debug_sprintf_event/exception calls must be used together with this view */ #define DEBUG_SPRINTF_MAX_ARGS 10 -static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, - char *out_buf, debug_sprintf_entry_t *curr_event) +int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, + char *out_buf, size_t out_buf_size, const char *inbuf) { + debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf; int num_longs, num_used_args = 0, i, rc = 0; int index[DEBUG_SPRINTF_MAX_ARGS]; @@ -1445,8 +1689,9 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, goto out; /* bufsize of entry too small */ if (num_longs == 1) { /* no args, we use only the string */ - strcpy(out_buf, curr_event->string); - rc = strlen(curr_event->string); + rc = strscpy(out_buf, curr_event->string, out_buf_size); + if (rc == -E2BIG) + rc = out_buf_size; goto out; } @@ -1458,15 +1703,17 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view, for (i = 0; i < num_used_args; i++) index[i] = i; - rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]], - curr_event->args[index[1]], curr_event->args[index[2]], - curr_event->args[index[3]], curr_event->args[index[4]], - curr_event->args[index[5]], curr_event->args[index[6]], - curr_event->args[index[7]], curr_event->args[index[8]], - curr_event->args[index[9]]); + rc = scnprintf(out_buf, out_buf_size, + curr_event->string, curr_event->args[index[0]], + curr_event->args[index[1]], curr_event->args[index[2]], + curr_event->args[index[3]], curr_event->args[index[4]], + curr_event->args[index[5]], curr_event->args[index[6]], + curr_event->args[index[7]], curr_event->args[index[8]], + curr_event->args[index[9]]); out: return rc; } +EXPORT_SYMBOL(debug_sprintf_format_fn); /* * debug_init: @@ -1474,7 +1721,7 @@ out: */ static int __init debug_init(void) { - s390dbf_sysctl_header = register_sysctl_table(s390dbf_dir_table); + s390dbf_sysctl_header = register_sysctl("s390dbf", s390dbf_table); mutex_lock(&debug_mutex); debug_debugfs_root_entry = debugfs_create_dir(DEBUG_DIR_ROOT, NULL); initialized = 1; diff --git a/arch/s390/kernel/diag/Makefile b/arch/s390/kernel/diag/Makefile new file mode 100644 index 000000000000..956aee6c4090 --- /dev/null +++ b/arch/s390/kernel/diag/Makefile @@ -0,0 +1 @@ +obj-y := diag_misc.o diag324.o diag.o diag310.o diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag/diag.c index a3f47464c3f1..56b862ba9be8 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag/diag.c @@ -11,9 +11,13 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/vmalloc.h> +#include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/trace/diag.h> #include <asm/sections.h> +#include <asm/asm.h> +#include "../entry.h" struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -33,6 +37,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" }, [DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" }, [DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" }, + [DIAG_STAT_X08C] = { .code = 0x08c, .name = "Access 3270 Display Device Information" }, [DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" }, [DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" }, [DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" }, @@ -46,12 +51,28 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" }, [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, + [DIAG_STAT_X310] = { .code = 0x310, .name = "Memory Topology Information" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, + [DIAG_STAT_X324] = { .code = 0x324, .name = "Power Information Block" }, + [DIAG_STAT_X49C] = { .code = 0x49c, .name = "Warning-Track Interruption" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; -struct diag_ops __bootdata_preserved(diag_dma_ops); -struct diag210 *__bootdata_preserved(__diag210_tmp_dma); +struct diag_ops __amode31_ref diag_amode31_ops = { + .diag210 = _diag210_amode31, + .diag26c = _diag26c_amode31, + .diag14 = _diag14_amode31, + .diag0c = _diag0c_amode31, + .diag8c = _diag8c_amode31, + .diag308_reset = _diag308_reset_amode31 +}; + +static struct diag210 _diag210_tmp_amode31 __section(".amode31.data"); +struct diag210 __amode31_ref *__diag210_tmp_amode31 = &_diag210_tmp_amode31; + +static struct diag8c _diag8c_tmp_amode31 __section(".amode31.data"); +static struct diag8c __amode31_ref *__diag8c_tmp_amode31 = &_diag8c_tmp_amode31; static int show_diag_stat(struct seq_file *m, void *v) { @@ -59,7 +80,7 @@ static int show_diag_stat(struct seq_file *m, void *v) unsigned long n = (unsigned long) v - 1; int cpu, prec, tmp; - get_online_cpus(); + cpus_read_lock(); if (n == 0) { seq_puts(m, " "); @@ -78,7 +99,7 @@ static int show_diag_stat(struct seq_file *m, void *v) } seq_printf(m, " %s\n", diag_map[n-1].name); } - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -130,20 +151,51 @@ void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr) EXPORT_SYMBOL(diag_stat_inc_norecursion); /* + * Diagnose 0c: Pseudo Timer + */ +void diag0c(struct hypfs_diag0c_entry *data) +{ + diag_stat_inc(DIAG_STAT_X00C); + diag_amode31_ops.diag0c(virt_to_phys(data)); +} + +/* * Diagnose 14: Input spool file manipulation + * + * The subcode parameter determines the type of the first parameter rx. + * Currently used are the following 3 subcommands: + * 0x0: Read the Next Spool File Buffer (Data Record) + * 0x28: Position a Spool File to the Designated Record + * 0xfff: Retrieve Next File Descriptor + * + * For subcommands 0x0 and 0xfff, the value of the first parameter is + * a virtual address of a memory buffer and needs virtual to physical + * address translation. For other subcommands the rx parameter is not + * a virtual address. */ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return diag_dma_ops.diag14(rx, ry1, subcode); + switch (subcode) { + case 0x0: + case 0xfff: + rx = virt_to_phys((void *)rx); + break; + default: + /* Do nothing */ + break; + } + return diag_amode31_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); +#define DIAG204_BUSY_RC 8 + static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr) { union register_pair rp = { .even = *subcode, .odd = size }; - asm volatile( + asm_inline volatile( " diag %[addr],%[rp],0x204\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -152,12 +204,35 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad return rp.odd; } +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @area points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero. If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. + */ int diag204(unsigned long subcode, unsigned long size, void *addr) { + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -EINVAL; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -EINVAL; + } + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); - if (subcode) - return -1; + if (subcode == DIAG204_BUSY_RC) + return -EBUSY; + else if (subcode) + return -EOPNOTSUPP; return size; } EXPORT_SYMBOL(diag204); @@ -172,29 +247,53 @@ int diag210(struct diag210 *addr) int ccode; spin_lock_irqsave(&diag210_lock, flags); - *__diag210_tmp_dma = *addr; + *__diag210_tmp_amode31 = *addr; diag_stat_inc(DIAG_STAT_X210); - ccode = diag_dma_ops.diag210(__diag210_tmp_dma); + ccode = diag_amode31_ops.diag210(__diag210_tmp_amode31); - *addr = *__diag210_tmp_dma; + *addr = *__diag210_tmp_amode31; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; } EXPORT_SYMBOL(diag210); +/* + * Diagnose 8C: Access 3270 Display Device Information + */ +int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) +{ + static DEFINE_SPINLOCK(diag8c_lock); + unsigned long flags; + int ccode; + + spin_lock_irqsave(&diag8c_lock, flags); + + diag_stat_inc(DIAG_STAT_X08C); + ccode = diag_amode31_ops.diag8c(__diag8c_tmp_amode31, devno, sizeof(*addr)); + + *addr = *__diag8c_tmp_amode31; + spin_unlock_irqrestore(&diag8c_lock, flags); + + return ccode; +} +EXPORT_SYMBOL(diag8c); + int diag224(void *ptr) { + unsigned long addr = __pa(ptr); int rc = -EOPNOTSUPP; diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" + asm_inline volatile("\n" + " diag %[type],%[addr],0x224\n" + "0: lhi %[rc],0\n" "1:\n" EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + : [rc] "+d" (rc) + , "=m" (*(struct { char buf[PAGE_SIZE]; } *)ptr) + : [type] "d" (0), [addr] "d" (addr)); return rc; } EXPORT_SYMBOL(diag224); @@ -205,6 +304,21 @@ EXPORT_SYMBOL(diag224); int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return diag_dma_ops.diag26c(req, resp, subcode); + return diag_amode31_ops.diag26c(virt_to_phys(req), virt_to_phys(resp), subcode); } EXPORT_SYMBOL(diag26c); + +int diag49c(unsigned long subcode) +{ + int cc; + + diag_stat_inc(DIAG_STAT_X49C); + asm volatile( + " diag %[subcode],0,0x49c\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [subcode] "d" (subcode) + : CC_CLOBBER); + return CC_TRANSFORM(cc); +} +EXPORT_SYMBOL(diag49c); diff --git a/arch/s390/kernel/diag/diag310.c b/arch/s390/kernel/diag/diag310.c new file mode 100644 index 000000000000..f411562aa7f6 --- /dev/null +++ b/arch/s390/kernel/diag/diag310.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Request memory topology information via diag0x310. + * + * Copyright IBM Corp. 2025 + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <asm/diag.h> +#include <asm/sclp.h> +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +#define DIAG310_LEVELMIN 1 +#define DIAG310_LEVELMAX 6 + +enum diag310_sc { + DIAG310_SUBC_0 = 0, + DIAG310_SUBC_1 = 1, + DIAG310_SUBC_4 = 4, + DIAG310_SUBC_5 = 5 +}; + +enum diag310_retcode { + DIAG310_RET_SUCCESS = 0x0001, + DIAG310_RET_BUSY = 0x0101, + DIAG310_RET_OPNOTSUPP = 0x0102, + DIAG310_RET_SC4_INVAL = 0x0401, + DIAG310_RET_SC4_NODATA = 0x0402, + DIAG310_RET_SC5_INVAL = 0x0501, + DIAG310_RET_SC5_NODATA = 0x0502, + DIAG310_RET_SC5_ESIZE = 0x0503 +}; + +union diag310_response { + u64 response; + struct { + u64 result : 32; + u64 : 16; + u64 rc : 16; + }; +}; + +union diag310_req_subcode { + u64 subcode; + struct { + u64 : 48; + u64 st : 8; + u64 sc : 8; + }; +}; + +union diag310_req_size { + u64 size; + struct { + u64 page_count : 32; + u64 : 32; + }; +}; + +static inline unsigned long diag310(unsigned long subcode, unsigned long size, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, .odd = size }; + + diag_stat_inc(DIAG_STAT_X310); + asm volatile("diag %[rp],%[subcode],0x310" + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "memory"); + return rp.odd; +} + +static int diag310_result_to_errno(unsigned int result) +{ + switch (result) { + case DIAG310_RET_BUSY: + return -EBUSY; + case DIAG310_RET_OPNOTSUPP: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int diag310_get_subcode_mask(unsigned long *mask) +{ + union diag310_response res; + + res.response = diag310(DIAG310_SUBC_0, 0, NULL); + if (res.rc != DIAG310_RET_SUCCESS) + return diag310_result_to_errno(res.rc); + *mask = res.response; + return 0; +} + +static int diag310_get_memtop_stride(unsigned long *stride) +{ + union diag310_response res; + + res.response = diag310(DIAG310_SUBC_1, 0, NULL); + if (res.rc != DIAG310_RET_SUCCESS) + return diag310_result_to_errno(res.rc); + *stride = res.result; + return 0; +} + +static int diag310_get_memtop_size(unsigned long *pages, unsigned long level) +{ + union diag310_req_subcode req = { .sc = DIAG310_SUBC_4, .st = level }; + union diag310_response res; + + res.response = diag310(req.subcode, 0, NULL); + switch (res.rc) { + case DIAG310_RET_SUCCESS: + *pages = res.result; + return 0; + case DIAG310_RET_SC4_NODATA: + return -ENODATA; + case DIAG310_RET_SC4_INVAL: + return -EINVAL; + default: + return diag310_result_to_errno(res.rc); + } +} + +static int diag310_store_topology_map(void *buf, unsigned long pages, unsigned long level) +{ + union diag310_req_subcode req_sc = { .sc = DIAG310_SUBC_5, .st = level }; + union diag310_req_size req_size = { .page_count = pages }; + union diag310_response res; + + res.response = diag310(req_sc.subcode, req_size.size, buf); + switch (res.rc) { + case DIAG310_RET_SUCCESS: + return 0; + case DIAG310_RET_SC5_NODATA: + return -ENODATA; + case DIAG310_RET_SC5_ESIZE: + return -EOVERFLOW; + case DIAG310_RET_SC5_INVAL: + return -EINVAL; + default: + return diag310_result_to_errno(res.rc); + } +} + +static int diag310_check_features(void) +{ + static int features_available; + unsigned long mask; + int rc; + + if (READ_ONCE(features_available)) + return 0; + if (!sclp.has_diag310) + return -EOPNOTSUPP; + rc = diag310_get_subcode_mask(&mask); + if (rc) + return rc; + if (!test_bit_inv(DIAG310_SUBC_1, &mask)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG310_SUBC_4, &mask)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG310_SUBC_5, &mask)) + return -EOPNOTSUPP; + WRITE_ONCE(features_available, 1); + return 0; +} + +static int memtop_get_stride_len(unsigned long *res) +{ + static unsigned long memtop_stride; + unsigned long stride; + int rc; + + stride = READ_ONCE(memtop_stride); + if (!stride) { + rc = diag310_get_memtop_stride(&stride); + if (rc) + return rc; + WRITE_ONCE(memtop_stride, stride); + } + *res = stride; + return 0; +} + +static int memtop_get_page_count(unsigned long *res, unsigned long level) +{ + static unsigned long memtop_pages[DIAG310_LEVELMAX]; + unsigned long pages; + int rc; + + if (level > DIAG310_LEVELMAX || level < DIAG310_LEVELMIN) + return -EINVAL; + pages = READ_ONCE(memtop_pages[level - 1]); + if (!pages) { + rc = diag310_get_memtop_size(&pages, level); + if (rc) + return rc; + WRITE_ONCE(memtop_pages[level - 1], pages); + } + *res = pages; + return 0; +} + +long diag310_memtop_stride(unsigned long arg) +{ + size_t __user *argp = (void __user *)arg; + unsigned long stride; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + rc = memtop_get_stride_len(&stride); + if (rc) + return rc; + if (put_user(stride, argp)) + return -EFAULT; + return 0; +} + +long diag310_memtop_len(unsigned long arg) +{ + size_t __user *argp = (void __user *)arg; + unsigned long pages, level; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + if (get_user(level, argp)) + return -EFAULT; + rc = memtop_get_page_count(&pages, level); + if (rc) + return rc; + if (put_user(pages * PAGE_SIZE, argp)) + return -EFAULT; + return 0; +} + +long diag310_memtop_buf(unsigned long arg) +{ + struct diag310_memtop __user *udata = (struct diag310_memtop __user *)arg; + unsigned long level, pages, data_size; + u64 address; + void *buf; + int rc; + + rc = diag310_check_features(); + if (rc) + return rc; + if (get_user(level, &udata->nesting_lvl)) + return -EFAULT; + if (get_user(address, &udata->address)) + return -EFAULT; + rc = memtop_get_page_count(&pages, level); + if (rc) + return rc; + data_size = pages * PAGE_SIZE; + buf = __vmalloc_node(data_size, PAGE_SIZE, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, + NUMA_NO_NODE, __builtin_return_address(0)); + if (!buf) + return -ENOMEM; + rc = diag310_store_topology_map(buf, pages, level); + if (rc) + goto out; + if (copy_to_user((void __user *)address, buf, data_size)) + rc = -EFAULT; +out: + vfree(buf); + return rc; +} diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c new file mode 100644 index 000000000000..fe325c2a2d0d --- /dev/null +++ b/arch/s390/kernel/diag/diag324.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Request power readings for resources in a computing environment via + * diag 0x324. diag 0x324 stores the power readings in the power information + * block (pib). + * + * Copyright IBM Corp. 2024 + */ + +#define pr_fmt(fmt) "diag324: " fmt +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/ioctl.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/ktime.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/timer.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> + +#include <asm/diag.h> +#include <asm/sclp.h> +#include <asm/timex.h> +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +enum subcode { + DIAG324_SUBC_0 = 0, + DIAG324_SUBC_1 = 1, + DIAG324_SUBC_2 = 2, +}; + +enum retcode { + DIAG324_RET_SUCCESS = 0x0001, + DIAG324_RET_SUBC_NOTAVAIL = 0x0103, + DIAG324_RET_INSUFFICIENT_SIZE = 0x0104, + DIAG324_RET_READING_UNAVAILABLE = 0x0105, +}; + +union diag324_response { + u64 response; + struct { + u64 installed : 32; + u64 : 16; + u64 rc : 16; + } sc0; + struct { + u64 format : 16; + u64 : 16; + u64 pib_len : 16; + u64 rc : 16; + } sc1; + struct { + u64 : 48; + u64 rc : 16; + } sc2; +}; + +union diag324_request { + u64 request; + struct { + u64 : 32; + u64 allocated : 16; + u64 : 12; + u64 sc : 4; + } sc2; +}; + +struct pib { + u32 : 8; + u32 num : 8; + u32 len : 16; + u32 : 24; + u32 hlen : 8; + u64 : 64; + u64 intv; + u8 r[]; +} __packed; + +struct pibdata { + struct pib *pib; + ktime_t expire; + u64 sequence; + size_t len; + int rc; +}; + +static DEFINE_MUTEX(pibmutex); +static struct pibdata pibdata; + +#define PIBWORK_DELAY (5 * NSEC_PER_SEC) + +static void pibwork_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(pibwork, pibwork_handler); + +static unsigned long diag324(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr }; + + diag_stat_inc(DIAG_STAT_X324); + asm volatile("diag %[rp],%[subcode],0x324" + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "memory"); + return rp.odd; +} + +static void pibwork_handler(struct work_struct *work) +{ + struct pibdata *data = &pibdata; + ktime_t timedout; + + mutex_lock(&pibmutex); + timedout = ktime_add_ns(data->expire, PIBWORK_DELAY); + if (ktime_before(ktime_get(), timedout)) { + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + goto out; + } + vfree(data->pib); + data->pib = NULL; +out: + mutex_unlock(&pibmutex); +} + +static void pib_update(struct pibdata *data) +{ + union diag324_request req = { .sc2.sc = DIAG324_SUBC_2, .sc2.allocated = data->len }; + union diag324_response res; + int rc; + + memset(data->pib, 0, data->len); + res.response = diag324(req.request, data->pib); + switch (res.sc2.rc) { + case DIAG324_RET_SUCCESS: + rc = 0; + break; + case DIAG324_RET_SUBC_NOTAVAIL: + rc = -ENOENT; + break; + case DIAG324_RET_INSUFFICIENT_SIZE: + rc = -EMSGSIZE; + break; + case DIAG324_RET_READING_UNAVAILABLE: + rc = -EBUSY; + break; + default: + rc = -EINVAL; + } + data->rc = rc; +} + +long diag324_pibbuf(unsigned long arg) +{ + struct diag324_pib __user *udata = (struct diag324_pib __user *)arg; + struct pibdata *data = &pibdata; + static bool first = true; + u64 address; + int rc; + + if (!data->len) + return -EOPNOTSUPP; + if (get_user(address, &udata->address)) + return -EFAULT; + mutex_lock(&pibmutex); + rc = -ENOMEM; + if (!data->pib) + data->pib = vmalloc(data->len); + if (!data->pib) + goto out; + if (first || ktime_after(ktime_get(), data->expire)) { + pib_update(data); + data->sequence++; + data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + first = false; + } + rc = data->rc; + if (rc != 0 && rc != -EBUSY) + goto out; + rc = copy_to_user((void __user *)address, data->pib, data->pib->len); + rc |= put_user(data->sequence, &udata->sequence); + if (rc) + rc = -EFAULT; +out: + mutex_unlock(&pibmutex); + return rc; +} + +long diag324_piblen(unsigned long arg) +{ + struct pibdata *data = &pibdata; + + if (!data->len) + return -EOPNOTSUPP; + if (put_user(data->len, (size_t __user *)arg)) + return -EFAULT; + return 0; +} + +static int __init diag324_init(void) +{ + union diag324_response res; + unsigned long installed; + + if (!sclp.has_diag324) + return -EOPNOTSUPP; + res.response = diag324(DIAG324_SUBC_0, NULL); + if (res.sc0.rc != DIAG324_RET_SUCCESS) + return -EOPNOTSUPP; + installed = res.response; + if (!test_bit_inv(DIAG324_SUBC_1, &installed)) + return -EOPNOTSUPP; + if (!test_bit_inv(DIAG324_SUBC_2, &installed)) + return -EOPNOTSUPP; + res.response = diag324(DIAG324_SUBC_1, NULL); + if (res.sc1.rc != DIAG324_RET_SUCCESS) + return -EOPNOTSUPP; + pibdata.len = res.sc1.pib_len; + return 0; +} +device_initcall(diag324_init); diff --git a/arch/s390/kernel/diag/diag_ioctl.h b/arch/s390/kernel/diag/diag_ioctl.h new file mode 100644 index 000000000000..7080be946785 --- /dev/null +++ b/arch/s390/kernel/diag/diag_ioctl.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _DIAG_IOCTL_H +#define _DIAG_IOCTL_H + +#include <linux/types.h> + +long diag324_pibbuf(unsigned long arg); +long diag324_piblen(unsigned long arg); + +long diag310_memtop_stride(unsigned long arg); +long diag310_memtop_len(unsigned long arg); +long diag310_memtop_buf(unsigned long arg); + +#endif /* _DIAG_IOCTL_H */ diff --git a/arch/s390/kernel/diag/diag_misc.c b/arch/s390/kernel/diag/diag_misc.c new file mode 100644 index 000000000000..efffe02ea02e --- /dev/null +++ b/arch/s390/kernel/diag/diag_misc.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide diagnose information via misc device /dev/diag. + * + * Copyright IBM Corp. 2024 + */ + +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ioctl.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/types.h> + +#include <uapi/asm/diag.h> +#include "diag_ioctl.h" + +static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + long rc; + + switch (cmd) { + case DIAG324_GET_PIBLEN: + rc = diag324_piblen(arg); + break; + case DIAG324_GET_PIBBUF: + rc = diag324_pibbuf(arg); + break; + case DIAG310_GET_STRIDE: + rc = diag310_memtop_stride(arg); + break; + case DIAG310_GET_MEMTOPLEN: + rc = diag310_memtop_len(arg); + break; + case DIAG310_GET_MEMTOPBUF: + rc = diag310_memtop_buf(arg); + break; + default: + rc = -ENOIOCTLCMD; + break; + } + return rc; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = diag_ioctl, +}; + +static struct miscdevice diagdev = { + .name = "diag", + .minor = MISC_DYNAMIC_MINOR, + .fops = &fops, + .mode = 0444, +}; + +static int diag_init(void) +{ + return misc_register(&diagdev); +} + +device_initcall(diag_init); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 5412efe328f8..1cec93895b3a 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -17,15 +17,14 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/reboot.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/atomic.h> +#include <linux/io.h> #include <asm/dis.h> -#include <asm/io.h> #include <asm/cpcmd.h> #include <asm/lowcore.h> #include <asm/debug.h> @@ -122,6 +121,7 @@ enum { U8_32, /* 8 bit unsigned value starting at 32 */ U12_16, /* 12 bit unsigned value starting at 16 */ U16_16, /* 16 bit unsigned value starting at 16 */ + U16_20, /* 16 bit unsigned value starting at 20 */ U16_32, /* 16 bit unsigned value starting at 32 */ U32_16, /* 32 bit unsigned value starting at 16 */ VX_12, /* Vector index register starting at position 12 */ @@ -184,6 +184,7 @@ static const struct s390_operand operands[] = { [U8_32] = { 8, 32, 0 }, [U12_16] = { 12, 16, 0 }, [U16_16] = { 16, 16, 0 }, + [U16_20] = { 16, 20, 0 }, [U16_32] = { 16, 32, 0 }, [U32_16] = { 32, 16, 0 }, [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, @@ -257,7 +258,6 @@ static const unsigned char formats[][6] = { [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, - [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, @@ -278,6 +278,7 @@ static const unsigned char formats[][6] = { [INSTR_SIL_RDI] = { D_20, B_16, I16_32, 0, 0, 0 }, [INSTR_SIL_RDU] = { D_20, B_16, U16_32, 0, 0, 0 }, [INSTR_SIY_IRD] = { D20_20, B_16, I8_8, 0, 0, 0 }, + [INSTR_SIY_RD] = { D20_20, B_16, 0, 0, 0, 0 }, [INSTR_SIY_URD] = { D20_20, B_16, U8_8, 0, 0, 0 }, [INSTR_SI_RD] = { D_20, B_16, 0, 0, 0, 0 }, [INSTR_SI_URD] = { D_20, B_16, U8_8, 0, 0, 0 }, @@ -299,23 +300,28 @@ static const unsigned char formats[][6] = { [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VV0UU] = { V_8, V_12, U8_28, U4_24, 0, 0 }, [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, - [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRI_VVV0UV] = { V_8, V_12, V_16, V_32, U8_24, 0 }, + [INSTR_VRR_0V0U] = { V_12, U16_20, 0, 0, 0, 0 }, [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_0VVU] = { V_12, V_16, U16_20, 0, 0, 0 }, [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 }, [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, [INSTR_VRR_VV0U] = { V_8, V_12, U4_32, 0, 0, 0 }, [INSTR_VRR_VV0U0U] = { V_8, V_12, U4_32, U4_24, 0, 0 }, + [INSTR_VRR_VV0U2] = { V_8, V_12, U4_24, 0, 0, 0 }, [INSTR_VRR_VV0UU2] = { V_8, V_12, U4_32, U4_28, 0, 0 }, [INSTR_VRR_VV0UUU] = { V_8, V_12, U4_32, U4_28, U4_24, 0 }, [INSTR_VRR_VVV] = { V_8, V_12, V_16, 0, 0, 0 }, [INSTR_VRR_VVV0U] = { V_8, V_12, V_16, U4_32, 0, 0 }, + [INSTR_VRR_VVV0U0] = { V_8, V_12, V_16, U4_24, 0, 0 }, [INSTR_VRR_VVV0U0U] = { V_8, V_12, V_16, U4_32, U4_24, 0 }, [INSTR_VRR_VVV0UU] = { V_8, V_12, V_16, U4_32, U4_28, 0 }, [INSTR_VRR_VVV0UUU] = { V_8, V_12, V_16, U4_32, U4_28, U4_24 }, @@ -452,21 +458,21 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) if (separator) ptr += sprintf(ptr, "%c", separator); if (operand->flags & OPERAND_GPR) - ptr += sprintf(ptr, "%%r%i", value); + ptr += sprintf(ptr, "%%r%u", value); else if (operand->flags & OPERAND_FPR) - ptr += sprintf(ptr, "%%f%i", value); + ptr += sprintf(ptr, "%%f%u", value); else if (operand->flags & OPERAND_AR) - ptr += sprintf(ptr, "%%a%i", value); + ptr += sprintf(ptr, "%%a%u", value); else if (operand->flags & OPERAND_CR) - ptr += sprintf(ptr, "%%c%i", value); + ptr += sprintf(ptr, "%%c%u", value); else if (operand->flags & OPERAND_VR) - ptr += sprintf(ptr, "%%v%i", value); + ptr += sprintf(ptr, "%%v%u", value); else if (operand->flags & OPERAND_PCREL) { void *pcrel = (void *)((int)value + addr); ptr += sprintf(ptr, "%px", pcrel); } else if (operand->flags & OPERAND_SIGNED) - ptr += sprintf(ptr, "%i", value); + ptr += sprintf(ptr, "%i", (int)value); else ptr += sprintf(ptr, "%u", value); if (operand->flags & OPERAND_DISP) @@ -497,24 +503,27 @@ static int copy_from_regs(struct pt_regs *regs, void *dst, void *src, int len) void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; + unsigned long addr, pswaddr; unsigned char code[64]; char buffer[128], *ptr; - unsigned long addr; int start, end, opsize, hops, i; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); /* Get a snapshot of the 64 bytes surrounding the fault address. */ - for (start = 32; start && regs->psw.addr >= 34 - start; start -= 2) { - addr = regs->psw.addr - 34 + start; + for (start = 32; start && pswaddr >= 34 - start; start -= 2) { + addr = pswaddr - 34 + start; if (copy_from_regs(regs, code + start - 2, (void *)addr, 2)) break; } for (end = 32; end < 64; end += 2) { - addr = regs->psw.addr + end - 32; + addr = pswaddr + end - 32; if (copy_from_regs(regs, code + end, (void *)addr, 2)) break; } - /* Code snapshot useable ? */ - if ((regs->psw.addr & 1) || start >= end) { + /* Code snapshot usable ? */ + if ((pswaddr & 1) || start >= end) { printk("%s Code: Bad PSW.\n", mode); return; } @@ -537,12 +546,12 @@ void show_code(struct pt_regs *regs) while (start < end && hops < 8) { opsize = insn_length(code[start]); if (start + opsize == 32) - *ptr++ = '#'; + *ptr++ = '*'; else if (start == 32) *ptr++ = '>'; else *ptr++ = ' '; - addr = regs->psw.addr + start - 32; + addr = pswaddr + start - 32; ptr += sprintf(ptr, "%px: ", (void *)addr); if (start + opsize >= end) break; diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index db1bc00229ca..f9d52e05e01e 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <asm/asm-offsets.h> #include <asm/processor.h> #include <asm/debug.h> #include <asm/dis.h> @@ -41,60 +42,50 @@ const char *stack_type_name(enum stack_type type) EXPORT_SYMBOL_GPL(stack_type_name); static inline bool in_stack(unsigned long sp, struct stack_info *info, - enum stack_type type, unsigned long low, - unsigned long high) + enum stack_type type, unsigned long stack) { - if (sp < low || sp >= high) + if (sp < stack || sp >= stack + THREAD_SIZE) return false; info->type = type; - info->begin = low; - info->end = high; + info->begin = stack; + info->end = stack + THREAD_SIZE; return true; } static bool in_task_stack(unsigned long sp, struct task_struct *task, struct stack_info *info) { - unsigned long stack; + unsigned long stack = (unsigned long)task_stack_page(task); - stack = (unsigned long) task_stack_page(task); - return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); + return in_stack(sp, info, STACK_TYPE_TASK, stack); } static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->async_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.async_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->nodat_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.nodat_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_NODAT, stack); } static bool in_mcck_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->mcck_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.mcck_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_MCCK, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size, top; + unsigned long stack = get_lowcore()->restart_stack - STACK_INIT_OFFSET; - frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); - top = S390_lowcore.restart_stack + frame_size; - return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); + return in_stack(sp, info, STACK_TYPE_RESTART, stack); } int get_stack_info(unsigned long sp, struct task_struct *task, @@ -152,18 +143,28 @@ void show_stack(struct task_struct *task, unsigned long *stack, static void show_last_breaking_event(struct pt_regs *regs) { printk("Last Breaking-Event-Address:\n"); - printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]); + printk(" [<%016lx>] ", regs->last_break); + if (user_mode(regs)) { + print_vma_addr(KERN_CONT, regs->last_break); + pr_cont("\n"); + } else { + pr_cont("%pSR\n", (void *)regs->last_break); + } } void show_registers(struct pt_regs *regs) { struct psw_bits *psw = &psw_bits(regs->psw); + unsigned long pswaddr; char *mode; + pswaddr = regs->psw.addr; + if (test_pt_regs_flag(regs, PIF_PSW_ADDR_ADJUSTED)) + pswaddr = __forward_psw(regs->psw, regs->int_code >> 16); mode = user_mode(regs) ? "User" : "Krnl"; - printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)regs->psw.addr); + printk("%s PSW : %px %px", mode, (void *)regs->psw.mask, (void *)pswaddr); if (!user_mode(regs)) - pr_cont(" (%pSR)", (void *)regs->psw.addr); + pr_cont(" (%pSR)", (void *)pswaddr); pr_cont("\n"); printk(" R:%x T:%x IO:%x EX:%x Key:%x M:%x W:%x " "P:%x AS:%x CC:%x PM:%x", psw->per, psw->dat, psw->io, psw->ext, @@ -192,7 +193,7 @@ void show_regs(struct pt_regs *regs) static DEFINE_SPINLOCK(die_lock); -void die(struct pt_regs *regs, const char *str) +void __noreturn die(struct pt_regs *regs, const char *str) { static int die_counter; @@ -202,13 +203,8 @@ void die(struct pt_regs *regs, const char *str) console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); - printk("%s: %04x ilc:%d [#%d] ", str, regs->int_code & 0xffff, + printk("%s: %04x ilc:%d [#%d]", str, regs->int_code & 0xffff, regs->int_code >> 17, ++die_counter); -#ifdef CONFIG_PREEMPT - pr_cont("PREEMPT "); -#elif defined(CONFIG_PREEMPT_RT) - pr_cont("PREEMPT_RT "); -#endif pr_cont("SMP "); if (debug_pagealloc_enabled()) pr_cont("DEBUG_PAGEALLOC"); @@ -224,5 +220,5 @@ void die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception: panic_on_oops"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index fb84e3fc1686..b27239c03d79 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -2,12 +2,12 @@ /* * Copyright IBM Corp. 2007, 2009 * Author(s): Hongjie Yang <hongjie@us.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com> */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt +#include <linux/sched/debug.h> +#include <linux/cpufeature.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,8 +18,15 @@ #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/kernel.h> +#include <asm/asm-extable.h> +#include <linux/memblock.h> +#include <linux/kasan.h> +#include <asm/access-regs.h> +#include <asm/asm-offsets.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ebcdic.h> +#include <asm/fpu.h> #include <asm/ipl.h> #include <asm/lowcore.h> #include <asm/processor.h> @@ -30,24 +37,36 @@ #include <asm/sclp.h> #include <asm/facility.h> #include <asm/boot_data.h> -#include <asm/switch_to.h> #include "entry.h" -int __bootdata(is_full_image); +#define __decompressor_handled_param(func, param) \ +static int __init ignore_decompressor_param_##func(char *s) \ +{ \ + return 0; \ +} \ +early_param(#param, ignore_decompressor_param_##func) + +#define decompressor_handled_param(param) __decompressor_handled_param(param, param) + +decompressor_handled_param(mem); +decompressor_handled_param(vmalloc); +decompressor_handled_param(dfltcc); +decompressor_handled_param(facilities); +decompressor_handled_param(nokaslr); +decompressor_handled_param(cmma); +decompressor_handled_param(relocate_lowcore); +decompressor_handled_param(bootdebug); +__decompressor_handled_param(debug_alternative, debug-alternative); +#if IS_ENABLED(CONFIG_KVM) +decompressor_handled_param(prot_virt); +#endif -static void __init reset_tod_clock(void) +static void __init kasan_early_init(void) { - union tod_clock clk; - - if (store_tod_clock_ext_cc(&clk) == 0) - return; - /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk)) - disabled_wait(); - - memset(&tod_clock_base, 0, sizeof(tod_clock_base)); - tod_clock_base.tod = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; +#ifdef CONFIG_KASAN + init_task.kasan_depth = 0; + kasan_init_generic(); +#endif } /* @@ -68,26 +87,6 @@ static noinline __init void init_kernel_storage_key(void) static __initdata char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); -static noinline __init void detect_machine_type(void) -{ - struct sysinfo_3_2_2 *vmms = (struct sysinfo_3_2_2 *)&sysinfo_page; - - /* Check current-configuration-level */ - if (stsi(NULL, 0, 0, 0) <= 2) { - S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; - return; - } - /* Get virtual-machine cpu information. */ - if (stsi(vmms, 3, 2, 2) || !vmms->count) - return; - - /* Detect known hypervisors */ - if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; - else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) - S390_lowcore.machine_flags |= MACHINE_FLAG_VM; -} - /* Remove leading, trailing and double whitespace. */ static inline void strim_all(char *str) { @@ -106,6 +105,8 @@ static inline void strim_all(char *str) } } +char arch_hw_string[128]; + static noinline __init void setup_arch_string(void) { struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page; @@ -118,20 +119,21 @@ static noinline __init void setup_arch_string(void) EBCASC(mach->type, sizeof(mach->type)); EBCASC(mach->model, sizeof(mach->model)); EBCASC(mach->model_capacity, sizeof(mach->model_capacity)); - sprintf(mstr, "%-16.16s %-4.4s %-16.16s %-16.16s", - mach->manufacturer, mach->type, - mach->model, mach->model_capacity); + scnprintf(mstr, sizeof(mstr), "%-16.16s %-4.4s %-16.16s %-16.16s", + mach->manufacturer, mach->type, + mach->model, mach->model_capacity); strim_all(mstr); if (stsi(vm, 3, 2, 2) == 0 && vm->count) { EBCASC(vm->vm[0].cpi, sizeof(vm->vm[0].cpi)); - sprintf(hvstr, "%-16.16s", vm->vm[0].cpi); + scnprintf(hvstr, sizeof(hvstr), "%-16.16s", vm->vm[0].cpi); strim_all(hvstr); } else { - sprintf(hvstr, "%s", - MACHINE_IS_LPAR ? "LPAR" : - MACHINE_IS_VM ? "z/VM" : - MACHINE_IS_KVM ? "KVM" : "unknown"); + scnprintf(hvstr, sizeof(hvstr), "%s", + machine_is_lpar() ? "LPAR" : + machine_is_vm() ? "z/VM" : + machine_is_kvm() ? "KVM" : "unknown"); } + scnprintf(arch_hw_string, sizeof(arch_hw_string), "HW: %s (%s)", mstr, hvstr); dump_stack_set_arch_desc("%s (%s)", mstr, hvstr); } @@ -139,9 +141,8 @@ static __init void setup_topology(void) { int max_mnest; - if (!test_facility(11)) + if (!cpu_has_topology()) return; - S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; for (max_mnest = 6; max_mnest > 1; max_mnest--) { if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) break; @@ -149,112 +150,59 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -static void early_pgm_check_handler(void) +void __init __do_early_pgm_check(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - unsigned long cr0, cr0_new; - unsigned long addr; + struct lowcore *lc = get_lowcore(); + unsigned long ip; + + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = lc->trans_exc_code; + regs->last_break = lc->pgm_last_break; + ip = __rewind_psw(regs->psw, regs->int_code >> 16); - addr = S390_lowcore.program_old_psw.addr; - fixup = s390_search_extables(addr); - if (!fixup) - disabled_wait(); - /* Disable low address protection before storing into lowcore. */ - __ctl_store(cr0, 0, 0); - cr0_new = cr0 & ~(1UL << 28); - __ctl_load(cr0_new, 0, 0); - S390_lowcore.program_old_psw.addr = extable_fixup(fixup); - __ctl_load(cr0, 0, 0); + /* Monitor Event? Might be a warning */ + if ((regs->int_code & PGM_INT_CODE_MASK) == 0x40) { + if (report_bug(ip, regs) == BUG_TRAP_TYPE_WARN) + return; + } + if (fixup_exception(regs)) + return; + /* + * Unhandled exception - system cannot continue but try to get some + * helpful messages to the console. Use early_printk() to print + * some basic information in case it is too early for printk(). + */ + register_early_console(); + early_printk("PANIC: early exception %04x PSW: %016lx %016lx\n", + regs->int_code & 0xffff, regs->psw.mask, regs->psw.addr); + show_regs(regs); + disabled_wait(); } static noinline __init void setup_lowcore_early(void) { + struct lowcore *lc = get_lowcore(); psw_t psw; - psw.addr = (unsigned long)s390_base_pgm_handler; - psw.mask = PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA; - if (IS_ENABLED(CONFIG_KASAN)) - psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw = psw; - s390_base_pgm_handler_fn = early_pgm_check_handler; - S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; -} - -static noinline __init void setup_facility_list(void) -{ - memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); - if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, alt_stfle_fac_list); -} - -static __init void detect_diag9c(void) -{ - unsigned int cpu_address; - int rc; - - cpu_address = stap(); - diag_stat_inc(DIAG_STAT_X09C); - asm volatile( - " diag %2,0,0x9c\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); - if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; -} - -static __init void detect_machine_facilities(void) -{ - if (test_facility(8)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; - __ctl_set_bit(0, 23); - } - if (test_facility(78)) - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; - if (test_facility(3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; - if (test_facility(50) && test_facility(73)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_TE; - __ctl_set_bit(0, 55); - } - if (test_facility(51)) - S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; - if (test_facility(129)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_VX; - __ctl_set_bit(0, 17); - } - if (test_facility(130) && !noexec_disabled) { - S390_lowcore.machine_flags |= MACHINE_FLAG_NX; - __ctl_set_bit(0, 20); - } - if (test_facility(133)) - S390_lowcore.machine_flags |= MACHINE_FLAG_GS; - if (test_facility(139) && (tod_clock_base.tod >> 63)) { - /* Enabled signed clock comparator comparisons */ - S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; - clock_comparator_max = -1ULL >> 1; - __ctl_set_bit(0, 53); - } + psw.addr = (unsigned long)early_pgm_check_handler; + psw.mask = PSW_KERNEL_BITS; + lc->program_new_psw = psw; + lc->preempt_count = INIT_PREEMPT_COUNT; + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); } static inline void save_vector_registers(void) { #ifdef CONFIG_CRASH_DUMP - if (test_facility(129)) + if (cpu_has_vx()) save_vx_regs(boot_cpu_vector_save_area); #endif } -static inline void setup_control_registers(void) +static inline void setup_low_address_protection(void) { - unsigned long reg; - - __ctl_store(reg, 0, 0); - reg |= CR0_LOW_ADDRESS_PROTECTION; - reg |= CR0_EMERGENCY_SIGNAL_SUBMASK; - reg |= CR0_EXTERNAL_CALL_SUBMASK; - __ctl_load(reg, 0, 0); + system_ctl_set_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); } static inline void setup_access_registers(void) @@ -264,50 +212,32 @@ static inline void setup_access_registers(void) restore_access_regs(acrs); } -static int __init disable_vector_extension(char *str) -{ - S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; - __ctl_clear_bit(0, 17); - return 0; -} -early_param("novx", disable_vector_extension); - char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; static void __init setup_boot_command_line(void) { /* copy arch command line */ - strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE); + strscpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE); } -static void __init check_image_bootable(void) +static void __init sort_amode31_extable(void) { - if (is_full_image) - return; - - sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); - sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); - sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(); + sort_extable(__start_amode31_ex_table, __stop_amode31_ex_table); } void __init startup_init(void) { - reset_tod_clock(); - check_image_bootable(); + kasan_early_init(); time_early_init(); init_kernel_storage_key(); lockdep_off(); + sort_amode31_extable(); setup_lowcore_early(); - setup_facility_list(); - detect_machine_type(); setup_arch_string(); setup_boot_command_line(); - detect_diag9c(); - detect_machine_facilities(); save_vector_registers(); setup_topology(); sclp_early_detect(); - setup_control_registers(); + setup_low_address_protection(); setup_access_registers(); lockdep_on(); } diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index d9d53f44008a..cefe020a3be3 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <linux/console.h> #include <linux/kernel.h> #include <linux/init.h> +#include <asm/setup.h> #include <asm/sclp.h> static void sclp_early_write(struct console *con, const char *s, unsigned int len) @@ -20,6 +21,16 @@ static struct console sclp_early_console = { .index = -1, }; +void __init register_early_console(void) +{ + if (early_console) + return; + if (!sclp.has_linemode && !sclp.has_vt220) + return; + early_console = &sclp_early_console; + register_console(early_console); +} + static int __init setup_early_printk(char *buf) { if (early_console) @@ -27,10 +38,7 @@ static int __init setup_early_printk(char *buf) /* Accept only "earlyprintk" and "earlyprintk=sclp" */ if (buf && !str_has_prefix(buf, "sclp")) return 0; - if (!sclp.has_linemode && !sclp.has_vt220) - return 0; - early_console = &sclp_early_console; - register_console(early_console); + register_early_console(); return 0; } early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c index 7f8246c9be08..0e51fa537262 100644 --- a/arch/s390/kernel/ebcdic.c +++ b/arch/s390/kernel/ebcdic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * ECBDIC -> ASCII, ASCII -> ECBDIC, + * EBCDIC -> ASCII, ASCII -> EBCDIC, * upper to lower case (EBCDIC) conversion tables. * * S390 version diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 5a2f70cbd3a9..b7f1553d9ee5 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -6,12 +6,13 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Hartmut Penner (hp@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), - * Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/export.h> #include <linux/init.h> #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/asm-extable.h> +#include <asm/alternative.h> #include <asm/processor.h> #include <asm/cache.h> #include <asm/dwarf.h> @@ -23,68 +24,51 @@ #include <asm/page.h> #include <asm/sigp.h> #include <asm/irq.h> -#include <asm/vx-insn.h> +#include <asm/fpu-insn.h> #include <asm/setup.h> #include <asm/nmi.h> -#include <asm/export.h> #include <asm/nospec-insn.h> - -__PT_R0 = __PT_GPRS -__PT_R1 = __PT_GPRS + 8 -__PT_R2 = __PT_GPRS + 16 -__PT_R3 = __PT_GPRS + 24 -__PT_R4 = __PT_GPRS + 32 -__PT_R5 = __PT_GPRS + 40 -__PT_R6 = __PT_GPRS + 48 -__PT_R7 = __PT_GPRS + 56 -__PT_R8 = __PT_GPRS + 64 -__PT_R9 = __PT_GPRS + 72 -__PT_R10 = __PT_GPRS + 80 -__PT_R11 = __PT_GPRS + 88 -__PT_R12 = __PT_GPRS + 96 -__PT_R13 = __PT_GPRS + 104 -__PT_R14 = __PT_GPRS + 112 -__PT_R15 = __PT_GPRS + 120 - -STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER -STACK_SIZE = 1 << STACK_SHIFT -STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE +#include <asm/lowcore.h> +#include <asm/machine.h> _LPP_OFFSET = __LC_LPP - .macro CHECK_STACK savearea -#ifdef CONFIG_CHECK_STACK - tml %r15,STACK_SIZE - CONFIG_STACK_GUARD - lghi %r14,\savearea - jz stack_overflow -#endif + .macro STBEAR address + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", ALT_FACILITY(193) + .endm + + .macro LBEAR address + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193) + .endm + + .macro LPSWEY address, lpswe + ALTERNATIVE_2 "b \lpswe;nopr", \ + ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193), \ + __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ + ALT_FEATURE(MFEATURE_LOWCORE) + .endm + + .macro MBEAR reg, lowcore + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK(\lowcore)),\ + ALT_FACILITY(193) .endm - .macro CHECK_VMAP_STACK savearea,oklabel -#ifdef CONFIG_VMAP_STACK + .macro CHECK_VMAP_STACK savearea, lowcore, oklabel lgr %r14,%r15 - nill %r14,0x10000 - STACK_SIZE - oill %r14,STACK_INIT - clg %r14,__LC_KERNEL_STACK + nill %r14,0x10000 - THREAD_SIZE + oill %r14,STACK_INIT_OFFSET + clg %r14,__LC_KERNEL_STACK(\lowcore) je \oklabel - clg %r14,__LC_ASYNC_STACK + clg %r14,__LC_ASYNC_STACK(\lowcore) je \oklabel - clg %r14,__LC_MCCK_STACK + clg %r14,__LC_MCCK_STACK(\lowcore) je \oklabel - clg %r14,__LC_NODAT_STACK + clg %r14,__LC_NODAT_STACK(\lowcore) je \oklabel - clg %r14,__LC_RESTART_STACK + clg %r14,__LC_RESTART_STACK(\lowcore) je \oklabel - lghi %r14,\savearea - j stack_overflow -#else - j \oklabel -#endif - .endm - - .macro STCK savearea - ALTERNATIVE ".insn s,0xb2050000,\savearea", \ - ".insn s,0xb27c0000,\savearea", 25 + la %r14,\savearea(\lowcore) + j stack_invalid .endm /* @@ -110,163 +94,130 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "", ".long 0xb2e8c000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_SPEC(82) .endm .macro BPON - ALTERNATIVE "", ".long 0xb2e8d000", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm .macro BPENTER tif_ptr,tif_mask - ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .long 0xb2e8d000", \ - "", 82 + ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ + "j .+12; nop; nop", ALT_SPEC(82) .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask - ALTERNATIVE "jz .+8; .long 0xb2e8c000", \ - "jnz .+8; .long 0xb2e8d000", 82 - .endm - - /* - * The CHKSTG macro jumps to the provided label in case the - * machine check interruption code reports one of unrecoverable - * storage errors: - * - Storage error uncorrected - * - Storage key error uncorrected - * - Storage degradation with Failing-storage-address validity - */ - .macro CHKSTG errlabel - TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR) - jnz \errlabel - TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD - jz oklabel\@ - TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR - jnz \errlabel -oklabel\@: + ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm #if IS_ENABLED(CONFIG_KVM) - /* - * The OUTSIDE macro jumps to the provided label in case the value - * in the provided register is outside of the provided range. The - * macro is useful for checking whether a PSW stored in a register - * pair points inside or outside of a block of instructions. - * @reg: register to check - * @start: start of the range - * @end: end of the range - * @outside_label: jump here if @reg is outside of [@start..@end) - */ - .macro OUTSIDE reg,start,end,outside_label - lgr %r14,\reg - larl %r13,\start - slgr %r14,%r13 - lghi %r13,\end - \start - clgr %r14,%r13 - jhe \outside_label - .endm - - .macro SIEEXIT - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + .macro SIEEXIT sie_control,lowcore + lg %r9,\sie_control # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE(\lowcore) # load primary asce + lg %r9,__LC_CURRENT(\lowcore) + mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit .endm #endif + .macro STACKLEAK_ERASE +#ifdef CONFIG_KSTACK_ERASE + brasl %r14,stackleak_erase_on_task_stack +#endif + .endm + GEN_BR_THUNK %r14 - GEN_BR_THUNK %r14,%r13 .section .kprobes.text, "ax" .Ldummy: /* - * This nop exists only in order to avoid that __bpon starts at - * the beginning of the kprobes text section. In that case we would - * have several symbols at the same address. E.g. objdump would take - * an arbitrary symbol name when disassembling this code. - * With the added nop in between the __bpon symbol is unique - * again. + * The following nop exists only in order to avoid that the next + * symbol starts at the beginning of the kprobes text section. + * In that case there would be several symbols at the same address. + * E.g. objdump would take an arbitrary symbol when disassembling + * the code. + * With the added nop in between this cannot happen. */ nop 0 -ENTRY(__bpon) - .globl __bpon - BPON - BR_EX %r14 -ENDPROC(__bpon) - /* - * Scheduler resume function, called by switch_to - * gpr2 = (task_struct *) prev - * gpr3 = (task_struct *) next + * Scheduler resume function, called by __switch_to + * gpr2 = (task_struct *)prev + * gpr3 = (task_struct *)next * Returns: * gpr2 = prev */ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to_asm) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lghi %r4,__TASK_stack lghi %r1,__TASK_thread - llill %r5,STACK_INIT + llill %r5,STACK_INIT_OFFSET stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next - stg %r3,__LC_CURRENT # store task struct of next - stg %r15,__LC_KERNEL_STACK # store end of kernel stack + GET_LC %r13 + stg %r3,__LC_CURRENT(%r13) # store task struct of next + stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next - aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + lay %r4,__TASK_pid(%r3) + mvc __LC_CURRENT_PID(4,%r13),0(%r4) # store pid of next + ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) +#ifdef CONFIG_STACKPROTECTOR + lg %r3,__TASK_stack_canary(%r3) + stg %r3,__LC_STACK_CANARY(%r13) +#endif lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 BR_EX %r14 -ENDPROC(__switch_to) +SYM_FUNC_END(__switch_to_asm) #if IS_ENABLED(CONFIG_KVM) /* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area + * __sie64a calling convention: + * %r2 pointer to sie control block phys + * %r3 pointer to sie control block virt + * %r4 guest register save area + * %r5 guest asce */ -ENTRY(sie64a) +SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - lg %r12,__LC_CURRENT - stg %r2,__SF_SIE_CONTROL(%r15) # save control block pointer - stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area + GET_LC %r13 + lg %r14,__LC_CURRENT(%r13) + stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. + stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses + stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 - mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 - lg %r14,__LC_GMAP # get gmap pointer - ltgr %r14,%r14 - jz .Lsie_gmap - lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce -.Lsie_gmap: + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags + lmg %r0,%r13,0(%r4) # load guest gprs 0-13 + mvi __TI_sie(%r14),1 + stosm __SF_SIE_IRQ(%r15),0x03 # enable interrupts + lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now tm __SIE_PROG20+3(%r14),3 # last exit... jnz .Lsie_skip - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsie_skip # exit if fp/vx regs changed - BPEXIT __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr + BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_entry: sie 0(%r14) +# Let the next instruction be NOP to avoid triggering a machine check +# and handling it in a guest as result of the instruction execution. + nopr 7 +.Lsie_leave: BPOFF - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST .Lsie_skip: + lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce -.Lsie_done: -# some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There -# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between sie64a and .Lsie_done should not cause program -# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -.Lrewind_pad6: - nopr 7 -.Lrewind_pad4: - nopr 7 -.Lrewind_pad2: - nopr 7 - .globl sie_exit -sie_exit: + GET_LC %r14 + lctlg %c1,%c1,__LC_USER_ASCE(%r14) # load primary asce + lg %r14,__LC_CURRENT(%r14) + mvi __TI_sie(%r14),0 +SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) + stnsm __SF_SIE_IRQ(%r15),0xfc # disable interrupts lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to @@ -277,17 +228,8 @@ sie_exit: lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers lg %r2,__SF_SIE_REASON(%r15) # return exit reason code BR_EX %r14 -.Lsie_fault: - lghi %r14,-EFAULT - stg %r14,__SF_SIE_REASON(%r15) # set exit reason code - j sie_exit - - EX_TABLE(.Lrewind_pad6,.Lsie_fault) - EX_TABLE(.Lrewind_pad4,.Lsie_fault) - EX_TABLE(.Lrewind_pad2,.Lsie_fault) - EX_TABLE(sie_exit,.Lsie_fault) -ENDPROC(sie64a) -EXPORT_SYMBOL(sie64a) +SYM_FUNC_END(__sie64a) +EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -296,18 +238,17 @@ EXPORT_SYMBOL(sie_exit) * are entered with interrupts disabled. */ -ENTRY(system_call) - stpt __LC_SYS_ENTER_TIMER - stmg %r8,%r15,__LC_SAVE_AREA_SYNC +SYM_CODE_START(system_call) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF lghi %r14,0 .Lsysc_per: - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r12,__LC_CURRENT - lg %r15,__LC_KERNEL_STACK + STBEAR __LC_LAST_BREAK(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP # clear user controlled register to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -320,70 +261,71 @@ ENTRY(system_call) xgr %r10,%r10 xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC + mvc __PT_R8(64,%r2),__LC_SAVE_AREA(%r13) + MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + STACKLEAK_ERASE + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE -ENDPROC(system_call) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(system_call) # # a new process exits the kernel with ret_from_fork # -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) lgr %r3,%r11 brasl %r14,__ret_from_fork - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + STACKLEAK_ERASE + GET_LC %r13 + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPON + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER - b __LC_RETURN_LPSWE -ENDPROC(ret_from_fork) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(ret_from_fork) /* * Program check handler routine */ -ENTRY(pgm_check_handler) - stpt __LC_SYS_ENTER_TIMER +SYM_CODE_START(pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lg %r12,__LC_CURRENT - lghi %r10,0 - lmg %r8,%r9,__LC_PGM_OLD_PSW + lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) + xgr %r10,%r10 tmhh %r8,0x0001 # coming from user space? - jno .Lpgm_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE - j 3f # -> fault in user space -.Lpgm_skip_asce: + jo 3f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for program checks in sie64a - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f - SIEEXIT + lg %r11,__LC_CURRENT(%r13) + tm __TI_sie(%r11),0xff + jz 1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 lghi %r10,_PIF_GUEST_FAULT #endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 2f # -> enabled, can't be a double fault - tm __LC_PGM_ILC+3,0x80 # check for per exception + tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -3: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lg %r15,__LC_KERNEL_STACK +2: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + # CHECK_VMAP_STACK branches to stack_invalid or 4f + CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f +3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) stg %r10,__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13) stmg %r8,%r9,__PT_PSW(%r11) - # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -392,53 +334,56 @@ ENTRY(pgm_check_handler) xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 + xgr %r12,%r12 lgr %r2,%r11 brasl %r14,__do_pgm_check tmhh %r8,0x0001 # returning to user space? jno .Lpgm_exit_kernel - lctlg %c1,%c1,__LC_USER_ASCE - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER + STACKLEAK_ERASE + BPON + stpt __LC_EXIT_TIMER(%r13) .Lpgm_exit_kernel: - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - b __LC_RETURN_LPSWE + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # # single stepped system call # .Lpgm_svcper: - mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + mvc __LC_RETURN_PSW(8,%r13),__LC_SVC_NEW_PSW(%r13) larl %r14,.Lsysc_per - stg %r14,__LC_RETURN_PSW+8 + stg %r14,__LC_RETURN_PSW+8(%r13) lghi %r14,1 - lpswe __LC_RETURN_PSW # branch to .Lsysc_per -ENDPROC(pgm_check_handler) + LBEAR __LC_PGM_LAST_BREAK(%r13) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per +SYM_CODE_END(pgm_check_handler) /* * Interrupt handler macro used for external and IO interrupts. */ .macro INT_HANDLER name,lc_old_psw,handler -ENTRY(\name) - STCK __LC_INT_CLOCK - stpt __LC_SYS_ENTER_TIMER +SYM_CODE_START(\name) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stckf __LC_INT_CLOCK(%r13) + stpt __LC_SYS_ENTER_TIMER(%r13) + STBEAR __LC_LAST_BREAK(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - lmg %r8,%r9,\lc_old_psw + lmg %r8,%r9,\lc_old_psw(%r13) tmhh %r8,0x0001 # interrupting from user ? jnz 1f #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - SIEEXIT + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff + jz 0f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) +0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f -1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK +1: lg %r15,__LC_KERNEL_STACK(%r13) 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -452,114 +397,85 @@ ENTRY(\name) xgr %r7,%r7 xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + MBEAR %r11,%r13 stmg %r8,%r9,__PT_PSW(%r11) - tm %r8,0x0001 # coming from user space? - jno 1f - lctlg %c1,%c1,__LC_KERNEL_ASCE -1: lgr %r2,%r11 # pass pointer to pt_regs + lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + mvc __LC_RETURN_PSW(16,%r13),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? jno 2f - lctlg %c1,%c1,__LC_USER_ASCE - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER -2: lmg %r0,%r15,__PT_R0(%r11) - b __LC_RETURN_LPSWE -ENDPROC(\name) + STACKLEAK_ERASE + BPON + stpt __LC_EXIT_TIMER(%r13) +2: LBEAR __PT_LAST_BREAK(%r11) + lmg %r0,%r15,__PT_R0(%r11) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(\name) .endm + .section .irqentry.text, "ax" + INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq -/* - * Load idle PSW. - */ -ENTRY(psw_idle) - stg %r14,(__SF_GPRS+8*8)(%r15) - stg %r3,__SF_EMPTY(%r15) - larl %r1,psw_idle_exit - stg %r1,__SF_EMPTY+8(%r15) - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) -.Lpsw_idle_stcctm: - oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT - BPON - STCK __CLOCK_IDLE_ENTER(%r2) - stpt __TIMER_IDLE_ENTER(%r2) - lpswe __SF_EMPTY(%r15) -.globl psw_idle_exit -psw_idle_exit: - BR_EX %r14 -ENDPROC(psw_idle) + .section .kprobes.text, "ax" /* * Machine check handler routines */ -ENTRY(mcck_int_handler) - STCK __LC_MCCK_CLOCK +SYM_CODE_START(mcck_int_handler) BPOFF - la %r1,4095 # validate r1 - spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs - lg %r12,__LC_CURRENT - lmg %r8,%r9,__LC_MCK_OLD_PSW - TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + GET_LC %r13 + lmg %r8,%r9,__LC_MCK_OLD_PSW(%r13) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic - la %r14,4095 - lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lghi %r14,__LC_CPU_TIMER_SAVE_AREA - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + lay %r14,__LC_CPU_TIMER_SAVE_AREA(%r13) + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYS_ENTER_TIMER - clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER(%r13) + clc 0(8,%r14),__LC_EXIT_TIMER(%r13) jl 1f - la %r14,__LC_EXIT_TIMER -1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + la %r14,__LC_EXIT_TIMER(%r13) +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER(%r13) jl 2f - la %r14,__LC_LAST_UPDATE_TIMER + la %r14,__LC_LAST_UPDATE_TIMER(%r13) 2: spt 0(%r14) - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) -3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) +3: TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? - jnz 6f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + jnz .Lmcck_user + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f - OUTSIDE %r9,.Lsie_entry,.Lsie_skip,4f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST - j 5f -4: CHKSTG .Lmcck_panic -5: larl %r14,.Lstosm_tmp - stosm 0(%r14),0x04 # turn dat on, keep irqs off - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - SIEEXIT - j .Lmcck_stack + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff + jz .Lmcck_user + # Need to compare the address instead of __TI_SIE flag. + # Otherwise there would be a race between setting the flag + # and entering SIE (or leaving and clearing the flag). This + # would cause machine checks targeted at the guest to be + # handled by the host. + larl %r14,.Lsie_entry + clgrjl %r9,%r14, 4f + larl %r14,.Lsie_leave + clgrjhe %r9,%r14, 4f + lg %r10,__LC_PCPU(%r13) + oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST +4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -6: CHKSTG .Lmcck_panic - larl %r14,.Lstosm_tmp - stosm 0(%r14),0x04 # turn dat on, keep irqs off - tmhh %r8,0x0001 # interrupting from user ? - jz .Lmcck_stack - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -.Lmcck_stack: - lg %r15,__LC_MCCK_STACK +.Lmcck_user: + lg %r15,__LC_MCCK_STACK(%r13) la %r11,STACK_FRAME_OVERHEAD(%r15) - stctg %c1,%c1,__PT_CR1(%r11) - lctlg %c1,%c1,__LC_KERNEL_ASCE xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lghi %r14,__LC_GPREGS_SAVE_AREA+64 - stmg %r0,%r7,__PT_R0(%r11) + lay %r14,__LC_GPREGS_SAVE_AREA(%r13) + mvc __PT_R0(128,%r11),0(%r14) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -569,30 +485,22 @@ ENTRY(mcck_int_handler) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 - mvc __PT_R8(64,%r11),0(%r14) stmg %r8,%r9,__PT_PSW(%r11) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,s390_do_machine_check - cghi %r2,0 - je .Lmcck_return - lg %r1,__LC_KERNEL_STACK # switch to kernel stack - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - brasl %r14,s390_handle_mcck -.Lmcck_return: - lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW - tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ? jno 0f - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - stpt __LC_EXIT_TIMER -0: lmg %r11,%r15,__PT_R11(%r11) - b __LC_RETURN_MCCK_LPSWE + BPON + stpt __LC_EXIT_TIMER(%r13) +0: ALTERNATIVE "brcl 0,0", __stringify(lay %r12,__LC_LAST_BREAK_SAVE_AREA(%r13)),\ + ALT_FACILITY(193) + LBEAR 0(%r12) + lmg %r11,%r15,__PT_R11(%r11) + LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE .Lmcck_panic: /* @@ -603,10 +511,10 @@ ENTRY(mcck_int_handler) */ lhi %r5,0 lhi %r6,1 - larl %r7,.Lstop_lock + larl %r7,stop_lock cs %r5,%r6,0(%r7) # single CPU-stopper only jnz 4f - larl %r7,.Lthis_cpu + larl %r7,this_cpu stap 0(%r7) # this CPU address lh %r4,0(%r7) nilh %r4,0 @@ -622,23 +530,28 @@ ENTRY(mcck_int_handler) 3: sigp %r1,%r4,SIGP_STOP # stop this CPU brc SIGP_CC_BUSY,3b 4: j 4b -ENDPROC(mcck_int_handler) +SYM_CODE_END(mcck_int_handler) -# -# PSW restart interrupt handler -# -ENTRY(restart_int_handler) - ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 +SYM_CODE_START(restart_int_handler) + ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) stg %r15,__LC_SAVE_AREA_RESTART - lg %r15,__LC_RESTART_STACK + TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 + jz 0f + lctlg %c0,%c15,__LC_CREGS_SAVE_AREA +0: larl %r15,daton_psw + lpswe 0(%r15) # turn dat on, keep irqs off +.Ldaton: + GET_LC %r15 + lg %r15,__LC_RESTART_STACK(%r15) xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART - mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW + GET_LC %r13 + mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART(%r13) + mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW(%r13) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) - lg %r1,__LC_RESTART_FN # load fn, parm & source cpu - lg %r2,__LC_RESTART_DATA - lg %r3,__LC_RESTART_SOURCE + lg %r1,__LC_RESTART_FN(%r13) # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA(%r13) + lgf %r3,__LC_RESTART_SOURCE(%r13) ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu @@ -649,46 +562,53 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b -ENDPROC(restart_int_handler) +SYM_CODE_END(restart_int_handler) + + __INIT +SYM_CODE_START(early_pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(early_pgm_check_handler) + __FINIT .section .kprobes.text, "ax" -#if defined(CONFIG_CHECK_STACK) || defined(CONFIG_VMAP_STACK) /* - * The synchronous or the asynchronous stack overflowed. We are dead. + * The synchronous or the asynchronous stack pointer is invalid. We are dead. * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -ENTRY(stack_overflow) - lg %r15,__LC_NODAT_STACK # change to panic stack +SYM_CODE_START(stack_invalid) + GET_LC %r15 + lg %r15,__LC_NODAT_STACK(%r15) # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_R8(64,%r11),0(%r14) - stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + GET_LC %r2 + mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs - jg kernel_stack_overflow -ENDPROC(stack_overflow) -#endif + jg kernel_stack_invalid +SYM_CODE_END(stack_invalid) .section .data, "aw" - .align 4 -.Lstop_lock: .long 0 -.Lthis_cpu: .short 0 -.Lstosm_tmp: .byte 0 - .section .rodata, "a" -#define SYSCALL(esame,emu) .quad __s390x_ ## esame - .globl sys_call_table -sys_call_table: -#include "asm/syscall_table.h" -#undef SYSCALL - -#ifdef CONFIG_COMPAT - -#define SYSCALL(esame,emu) .quad __s390_ ## emu - .globl sys_call_table_emu -sys_call_table_emu: -#include "asm/syscall_table.h" -#undef SYSCALL -#endif + .balign 4 +SYM_DATA_LOCAL(stop_lock, .long 0) +SYM_DATA_LOCAL(this_cpu, .short 0) + .balign 8 +SYM_DATA_START_LOCAL(daton_psw) + .quad PSW_KERNEL_BITS + .quad .Ldaton +SYM_DATA_END(daton_psw) diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 1ab33465382f..dd55cc6bbc28 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -5,6 +5,7 @@ #include <linux/percpu.h> #include <linux/types.h> #include <linux/signal.h> +#include <asm/extable.h> #include <asm/ptrace.h> #include <asm/idle.h> @@ -16,10 +17,13 @@ void ext_int_handler(void); void io_int_handler(void); void mcck_int_handler(void); void restart_int_handler(void); +void early_pgm_check_handler(void); +struct task_struct *__switch_to_asm(struct task_struct *prev, struct task_struct *next); void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); void __do_pgm_check(struct pt_regs *regs); void __do_syscall(struct pt_regs *regs, int per_trap); +void __do_early_pgm_check(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); @@ -27,21 +31,16 @@ void do_secure_storage_access(struct pt_regs *regs); void do_non_secure_storage_access(struct pt_regs *regs); void do_secure_storage_violation(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void kernel_stack_overflow(struct pt_regs * regs); -void do_signal(struct pt_regs *regs); +void kernel_stack_invalid(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs); -void do_notify_resume(struct pt_regs *regs); -void __init init_IRQ(void); void do_io_irq(struct pt_regs *regs); void do_ext_irq(struct pt_regs *regs); void do_restart(void *arg); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); -void __init time_init(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; @@ -64,4 +63,13 @@ void stack_free(unsigned long stack); extern char kprobes_insn_page[]; +extern char _samode31[], _eamode31[]; +extern char _stext_amode31[], _etext_amode31[]; +extern struct exception_table_entry _start_amode31_ex_table[]; +extern struct exception_table_entry _stop_amode31_ex_table[]; + +#define __amode31_data __section(".amode31.data") +#define __amode31_ref __section(".amode31.refs") +extern long _start_amode31_refs[], _end_amode31_refs[]; + #endif /* _ENTRY_H */ diff --git a/arch/s390/kernel/facility.c b/arch/s390/kernel/facility.c new file mode 100644 index 000000000000..d028b0be5c1d --- /dev/null +++ b/arch/s390/kernel/facility.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2023 + */ + +#include <linux/export.h> +#include <asm/facility.h> + +unsigned int stfle_size(void) +{ + static unsigned int size; + unsigned int r; + u64 dummy; + + r = READ_ONCE(size); + if (!r) { + r = __stfle_asm(&dummy, 1) + 1; + WRITE_ONCE(size, r); + } + return r; +} +EXPORT_SYMBOL(stfle_size); diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index d864c9a325e2..03a8973aec3c 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -5,261 +5,191 @@ * Copyright IBM Corp. 2015 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ + +#include <linux/export.h> #include <linux/kernel.h> #include <linux/cpu.h> #include <linux/sched.h> -#include <asm/fpu/types.h> -#include <asm/fpu/api.h> - -asm(".include \"asm/vx-insn.h\"\n"); +#include <asm/fpu.h> -void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) +void __kernel_fpu_begin(struct kernel_fpu *state, int flags) { + __vector128 *vxrs = state->vxrs; + int mask; + /* * Limit the save to the FPU/vector registers already - * in use by the previous context + * in use by the previous context. */ - flags &= state->mask; - + flags &= state->hdr.mask; if (flags & KERNEL_FPC) - /* Save floating point control */ - asm volatile("stfpc %0" : "=Q" (state->fpc)); - - if (!MACHINE_HAS_VX) { - if (flags & KERNEL_VXR_V0V7) { - /* Save floating-point registers */ - asm volatile("std 0,%0" : "=Q" (state->fprs[0])); - asm volatile("std 1,%0" : "=Q" (state->fprs[1])); - asm volatile("std 2,%0" : "=Q" (state->fprs[2])); - asm volatile("std 3,%0" : "=Q" (state->fprs[3])); - asm volatile("std 4,%0" : "=Q" (state->fprs[4])); - asm volatile("std 5,%0" : "=Q" (state->fprs[5])); - asm volatile("std 6,%0" : "=Q" (state->fprs[6])); - asm volatile("std 7,%0" : "=Q" (state->fprs[7])); - asm volatile("std 8,%0" : "=Q" (state->fprs[8])); - asm volatile("std 9,%0" : "=Q" (state->fprs[9])); - asm volatile("std 10,%0" : "=Q" (state->fprs[10])); - asm volatile("std 11,%0" : "=Q" (state->fprs[11])); - asm volatile("std 12,%0" : "=Q" (state->fprs[12])); - asm volatile("std 13,%0" : "=Q" (state->fprs[13])); - asm volatile("std 14,%0" : "=Q" (state->fprs[14])); - asm volatile("std 15,%0" : "=Q" (state->fprs[15])); - } + fpu_stfpc(&state->hdr.fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + save_fp_regs_vx(vxrs); return; } - - /* Test and save vector registers */ - asm volatile ( - /* - * Test if any vector register must be saved and, if so, - * test if all register can be saved. - */ - " la 1,%[vxrs]\n" /* load save area */ - " tmll %[m],30\n" /* KERNEL_VXR */ - " jz 7f\n" /* no work -> done */ - " jo 5f\n" /* -> save V0..V31 */ - /* - * Test for special case KERNEL_FPU_MID only. In this - * case a vstm V8..V23 is the best instruction - */ - " chi %[m],12\n" /* KERNEL_VXR_MID */ - " jne 0f\n" /* -> save V8..V23 */ - " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */ - " j 7f\n" - /* Test and save the first half of 16 vector registers */ - "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ - " jz 3f\n" /* -> KERNEL_VXR_HIGH */ - " jo 2f\n" /* 11 -> save V0..V15 */ - " brc 2,1f\n" /* 10 -> save V8..V15 */ - " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */ - " j 3f\n" - "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */ - " j 3f\n" - "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ - /* Test and save the second half of 16 vector registers */ - "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ - " jz 7f\n" - " jo 6f\n" /* 11 -> save V16..V31 */ - " brc 2,4f\n" /* 10 -> save V24..V31 */ - " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */ - " j 7f\n" - "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */ - " j 7f\n" - "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */ - "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */ - "7:" - : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) - : [m] "d" (flags) - : "1", "cc"); + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + vxrs += fpu_vstm(0, 15, vxrs); + vxrs += fpu_vstm(16, 31, vxrs); + return; + } + if (mask == KERNEL_VXR_MID) { + vxrs += fpu_vstm(8, 23, vxrs); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + vxrs += fpu_vstm(0, 15, vxrs); + else if (mask == KERNEL_VXR_V0V7) + vxrs += fpu_vstm(0, 7, vxrs); + else + vxrs += fpu_vstm(8, 15, vxrs); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + vxrs += fpu_vstm(16, 31, vxrs); + else if (mask == KERNEL_VXR_V16V23) + vxrs += fpu_vstm(16, 23, vxrs); + else + vxrs += fpu_vstm(24, 31, vxrs); + } } EXPORT_SYMBOL(__kernel_fpu_begin); -void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) +void __kernel_fpu_end(struct kernel_fpu *state, int flags) { + __vector128 *vxrs = state->vxrs; + int mask; + /* * Limit the restore to the FPU/vector registers of the - * previous context that have been overwritte by the - * current context + * previous context that have been overwritten by the + * current context. */ - flags &= state->mask; - + flags &= state->hdr.mask; if (flags & KERNEL_FPC) - /* Restore floating-point controls */ - asm volatile("lfpc %0" : : "Q" (state->fpc)); - - if (!MACHINE_HAS_VX) { - if (flags & KERNEL_VXR_V0V7) { - /* Restore floating-point registers */ - asm volatile("ld 0,%0" : : "Q" (state->fprs[0])); - asm volatile("ld 1,%0" : : "Q" (state->fprs[1])); - asm volatile("ld 2,%0" : : "Q" (state->fprs[2])); - asm volatile("ld 3,%0" : : "Q" (state->fprs[3])); - asm volatile("ld 4,%0" : : "Q" (state->fprs[4])); - asm volatile("ld 5,%0" : : "Q" (state->fprs[5])); - asm volatile("ld 6,%0" : : "Q" (state->fprs[6])); - asm volatile("ld 7,%0" : : "Q" (state->fprs[7])); - asm volatile("ld 8,%0" : : "Q" (state->fprs[8])); - asm volatile("ld 9,%0" : : "Q" (state->fprs[9])); - asm volatile("ld 10,%0" : : "Q" (state->fprs[10])); - asm volatile("ld 11,%0" : : "Q" (state->fprs[11])); - asm volatile("ld 12,%0" : : "Q" (state->fprs[12])); - asm volatile("ld 13,%0" : : "Q" (state->fprs[13])); - asm volatile("ld 14,%0" : : "Q" (state->fprs[14])); - asm volatile("ld 15,%0" : : "Q" (state->fprs[15])); - } + fpu_lfpc(&state->hdr.fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + load_fp_regs_vx(vxrs); return; } - - /* Test and restore (load) vector registers */ - asm volatile ( - /* - * Test if any vector register must be loaded and, if so, - * test if all registers can be loaded at once. - */ - " la 1,%[vxrs]\n" /* load restore area */ - " tmll %[m],30\n" /* KERNEL_VXR */ - " jz 7f\n" /* no work -> done */ - " jo 5f\n" /* -> restore V0..V31 */ - /* - * Test for special case KERNEL_FPU_MID only. In this - * case a vlm V8..V23 is the best instruction - */ - " chi %[m],12\n" /* KERNEL_VXR_MID */ - " jne 0f\n" /* -> restore V8..V23 */ - " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */ - " j 7f\n" - /* Test and restore the first half of 16 vector registers */ - "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */ - " jz 3f\n" /* -> KERNEL_VXR_HIGH */ - " jo 2f\n" /* 11 -> restore V0..V15 */ - " brc 2,1f\n" /* 10 -> restore V8..V15 */ - " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */ - " j 3f\n" - "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */ - " j 3f\n" - "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ - /* Test and restore the second half of 16 vector registers */ - "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */ - " jz 7f\n" - " jo 6f\n" /* 11 -> restore V16..V31 */ - " brc 2,4f\n" /* 10 -> restore V24..V31 */ - " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */ - " j 7f\n" - "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */ - " j 7f\n" - "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */ - "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */ - "7:" - : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs) - : [m] "d" (flags) - : "1", "cc"); + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + vxrs += fpu_vlm(0, 15, vxrs); + vxrs += fpu_vlm(16, 31, vxrs); + return; + } + if (mask == KERNEL_VXR_MID) { + vxrs += fpu_vlm(8, 23, vxrs); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + vxrs += fpu_vlm(0, 15, vxrs); + else if (mask == KERNEL_VXR_V0V7) + vxrs += fpu_vlm(0, 7, vxrs); + else + vxrs += fpu_vlm(8, 15, vxrs); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + vxrs += fpu_vlm(16, 31, vxrs); + else if (mask == KERNEL_VXR_V16V23) + vxrs += fpu_vlm(16, 23, vxrs); + else + vxrs += fpu_vlm(24, 31, vxrs); + } } EXPORT_SYMBOL(__kernel_fpu_end); -void __load_fpu_regs(void) +void load_fpu_state(struct fpu *state, int flags) { - struct fpu *state = ¤t->thread.fpu; - unsigned long *regs = current->thread.fpu.regs; + __vector128 *vxrs = &state->vxrs[0]; + int mask; - asm volatile("lfpc %0" : : "Q" (state->fpc)); - if (likely(MACHINE_HAS_VX)) { - asm volatile("lgr 1,%0\n" - "VLM 0,15,0,1\n" - "VLM 16,31,256,1\n" - : - : "d" (regs) - : "1", "cc", "memory"); - } else { - asm volatile("ld 0,%0" : : "Q" (regs[0])); - asm volatile("ld 1,%0" : : "Q" (regs[1])); - asm volatile("ld 2,%0" : : "Q" (regs[2])); - asm volatile("ld 3,%0" : : "Q" (regs[3])); - asm volatile("ld 4,%0" : : "Q" (regs[4])); - asm volatile("ld 5,%0" : : "Q" (regs[5])); - asm volatile("ld 6,%0" : : "Q" (regs[6])); - asm volatile("ld 7,%0" : : "Q" (regs[7])); - asm volatile("ld 8,%0" : : "Q" (regs[8])); - asm volatile("ld 9,%0" : : "Q" (regs[9])); - asm volatile("ld 10,%0" : : "Q" (regs[10])); - asm volatile("ld 11,%0" : : "Q" (regs[11])); - asm volatile("ld 12,%0" : : "Q" (regs[12])); - asm volatile("ld 13,%0" : : "Q" (regs[13])); - asm volatile("ld 14,%0" : : "Q" (regs[14])); - asm volatile("ld 15,%0" : : "Q" (regs[15])); + if (flags & KERNEL_FPC) + fpu_lfpc_safe(&state->fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_V0V7) + load_fp_regs_vx(state->vxrs); + return; + } + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + fpu_vlm(0, 15, &vxrs[0]); + fpu_vlm(16, 31, &vxrs[16]); + return; + } + if (mask == KERNEL_VXR_MID) { + fpu_vlm(8, 23, &vxrs[8]); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + fpu_vlm(0, 15, &vxrs[0]); + else if (mask == KERNEL_VXR_V0V7) + fpu_vlm(0, 7, &vxrs[0]); + else + fpu_vlm(8, 15, &vxrs[8]); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + fpu_vlm(16, 31, &vxrs[16]); + else if (mask == KERNEL_VXR_V16V23) + fpu_vlm(16, 23, &vxrs[16]); + else + fpu_vlm(24, 31, &vxrs[24]); } - clear_cpu_flag(CIF_FPU); -} -EXPORT_SYMBOL(__load_fpu_regs); - -void load_fpu_regs(void) -{ - raw_local_irq_disable(); - __load_fpu_regs(); - raw_local_irq_enable(); } -EXPORT_SYMBOL(load_fpu_regs); -void save_fpu_regs(void) +void save_fpu_state(struct fpu *state, int flags) { - unsigned long flags, *regs; - struct fpu *state; + __vector128 *vxrs = &state->vxrs[0]; + int mask; - local_irq_save(flags); - - if (test_cpu_flag(CIF_FPU)) - goto out; - - state = ¤t->thread.fpu; - regs = current->thread.fpu.regs; - - asm volatile("stfpc %0" : "=Q" (state->fpc)); - if (likely(MACHINE_HAS_VX)) { - asm volatile("lgr 1,%0\n" - "VSTM 0,15,0,1\n" - "VSTM 16,31,256,1\n" - : - : "d" (regs) - : "1", "cc", "memory"); - } else { - asm volatile("std 0,%0" : "=Q" (regs[0])); - asm volatile("std 1,%0" : "=Q" (regs[1])); - asm volatile("std 2,%0" : "=Q" (regs[2])); - asm volatile("std 3,%0" : "=Q" (regs[3])); - asm volatile("std 4,%0" : "=Q" (regs[4])); - asm volatile("std 5,%0" : "=Q" (regs[5])); - asm volatile("std 6,%0" : "=Q" (regs[6])); - asm volatile("std 7,%0" : "=Q" (regs[7])); - asm volatile("std 8,%0" : "=Q" (regs[8])); - asm volatile("std 9,%0" : "=Q" (regs[9])); - asm volatile("std 10,%0" : "=Q" (regs[10])); - asm volatile("std 11,%0" : "=Q" (regs[11])); - asm volatile("std 12,%0" : "=Q" (regs[12])); - asm volatile("std 13,%0" : "=Q" (regs[13])); - asm volatile("std 14,%0" : "=Q" (regs[14])); - asm volatile("std 15,%0" : "=Q" (regs[15])); + if (flags & KERNEL_FPC) + fpu_stfpc(&state->fpc); + if (!cpu_has_vx()) { + if (flags & KERNEL_VXR_LOW) + save_fp_regs_vx(state->vxrs); + return; + } + mask = flags & KERNEL_VXR; + if (mask == KERNEL_VXR) { + fpu_vstm(0, 15, &vxrs[0]); + fpu_vstm(16, 31, &vxrs[16]); + return; + } + if (mask == KERNEL_VXR_MID) { + fpu_vstm(8, 23, &vxrs[8]); + return; + } + mask = flags & KERNEL_VXR_LOW; + if (mask) { + if (mask == KERNEL_VXR_LOW) + fpu_vstm(0, 15, &vxrs[0]); + else if (mask == KERNEL_VXR_V0V7) + fpu_vstm(0, 7, &vxrs[0]); + else + fpu_vstm(8, 15, &vxrs[8]); + } + mask = flags & KERNEL_VXR_HIGH; + if (mask) { + if (mask == KERNEL_VXR_HIGH) + fpu_vstm(16, 31, &vxrs[16]); + else if (mask == KERNEL_VXR_V16V23) + fpu_vstm(16, 23, &vxrs[16]); + else + fpu_vstm(24, 31, &vxrs[24]); } - set_cpu_flag(CIF_FPU); -out: - local_irq_restore(flags); } -EXPORT_SYMBOL(save_fpu_regs); +EXPORT_SYMBOL(save_fpu_state); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c6ddeb5029b4..e94bb98f5231 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -4,22 +4,27 @@ * * Copyright IBM Corp. 2009,2014 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/moduleloader.h> #include <linux/hardirq.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/kmsan-checks.h> +#include <linux/cpufeature.h> #include <linux/kprobes.h> +#include <linux/execmem.h> #include <trace/syscall.h> #include <asm/asm-offsets.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> +#include <asm/ftrace.lds.h> +#include <asm/nospec-branch.h> #include <asm/set_memory.h> #include "entry.h" +#include "ftrace.h" /* * To generate function prologue either gcc's hotpatch feature (since gcc 4.8) @@ -40,123 +45,236 @@ * trampoline (ftrace_plt), which clobbers also r1. */ -unsigned long ftrace_plt; +void *ftrace_func __read_mostly = ftrace_stub; +struct ftrace_insn { + u16 opc; + s32 disp; +} __packed; -int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) +static const char *ftrace_shared_hotpatch_trampoline(const char **end) { - return 0; + const char *tstart, *tend; + + tstart = ftrace_shared_hotpatch_trampoline_br; + tend = ftrace_shared_hotpatch_trampoline_br_end; +#ifdef CONFIG_EXPOLINE + if (!nospec_disable) { + tstart = ftrace_shared_hotpatch_trampoline_exrl; + tend = ftrace_shared_hotpatch_trampoline_exrl_end; + } +#endif /* CONFIG_EXPOLINE */ + if (end) + *end = tend; + return tstart; } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +bool ftrace_need_init_nop(void) { - struct ftrace_insn orig, new, old; + return !cpu_has_seq_insn(); +} - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace ftrace call with a nop. */ - ftrace_generate_call_insn(&orig, rec->ip); - ftrace_generate_nop_insn(&new); +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) +{ + static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = + __ftrace_hotpatch_trampolines_start; + static const struct ftrace_insn orig = { .opc = 0xc004, .disp = 0 }; + static struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_hotpatch_trampoline **next_trampoline; + struct ftrace_hotpatch_trampoline *trampolines_end; + struct ftrace_hotpatch_trampoline tmp; + struct ftrace_insn *insn; + struct ftrace_insn old; + const char *shared; + s32 disp; + + BUILD_BUG_ON(sizeof(struct ftrace_hotpatch_trampoline) != + SIZEOF_FTRACE_HOTPATCH_TRAMPOLINE); + + next_trampoline = &next_vmlinux_trampoline; + trampolines_end = __ftrace_hotpatch_trampolines_end; + shared = ftrace_shared_hotpatch_trampoline(NULL); +#ifdef CONFIG_MODULES + if (mod) { + next_trampoline = &mod->arch.next_trampoline; + trampolines_end = mod->arch.trampolines_end; + } +#endif - /* Verify that the to be replaced code matches what we expect. */ - if (memcmp(&orig, &old, sizeof(old))) + if (WARN_ON_ONCE(*next_trampoline >= trampolines_end)) + return -ENOMEM; + trampoline = (*next_trampoline)++; + + if (copy_from_kernel_nofault(&old, (void *)rec->ip, sizeof(old))) + return -EFAULT; + /* Check for the compiler-generated fentry nop (brcl 0, .). */ + if (WARN_ON_ONCE(memcmp(&orig, &old, sizeof(old)))) return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + + /* Generate the trampoline. */ + tmp.brasl_opc = 0xc015; /* brasl %r1, shared */ + tmp.brasl_disp = (shared - (const char *)&trampoline->brasl_opc) / 2; + tmp.interceptor = FTRACE_ADDR; + tmp.rest_of_intercepted_function = rec->ip + sizeof(struct ftrace_insn); + s390_kernel_write(trampoline, &tmp, sizeof(tmp)); + + /* Generate a jump to the trampoline. */ + disp = ((char *)trampoline - (char *)rec->ip) / 2; + insn = (struct ftrace_insn *)rec->ip; + s390_kernel_write(&insn->disp, &disp, sizeof(disp)); + return 0; } -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrace *rec) { - struct ftrace_insn orig, new, old; + struct ftrace_hotpatch_trampoline *trampoline; + struct ftrace_insn insn; + s64 disp; + u16 opc; + + if (copy_from_kernel_nofault(&insn, (void *)rec->ip, sizeof(insn))) + return ERR_PTR(-EFAULT); + disp = (s64)insn.disp * 2; + trampoline = (void *)(rec->ip + disp); + if (get_kernel_nofault(opc, &trampoline->brasl_opc)) + return ERR_PTR(-EFAULT); + if (opc != 0xc015) + return ERR_PTR(-EINVAL); + return trampoline; +} - if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old))) - return -EFAULT; - /* Replace nop with an ftrace call. */ - ftrace_generate_nop_insn(&orig); - ftrace_generate_call_insn(&new, rec->ip); +static inline struct ftrace_insn +ftrace_generate_branch_insn(unsigned long ip, unsigned long target) +{ + /* brasl r0,target or brcl 0,0 */ + return (struct ftrace_insn){ .opc = target ? 0xc005 : 0xc004, + .disp = target ? (target - ip) / 2 : 0 }; +} + +static int ftrace_patch_branch_insn(unsigned long ip, unsigned long old_target, + unsigned long target) +{ + struct ftrace_insn orig = ftrace_generate_branch_insn(ip, old_target); + struct ftrace_insn new = ftrace_generate_branch_insn(ip, target); + struct ftrace_insn old; + if (!IS_ALIGNED(ip, 8)) + return -EINVAL; + if (copy_from_kernel_nofault(&old, (void *)ip, sizeof(old))) + return -EFAULT; /* Verify that the to be replaced code matches what we expect. */ if (memcmp(&orig, &old, sizeof(old))) return -EINVAL; - s390_kernel_write((void *) rec->ip, &new, sizeof(new)); + s390_kernel_write((void *)ip, &new, sizeof(new)); return 0; } -int ftrace_update_ftrace_func(ftrace_func_t func) +static int ftrace_modify_trampoline_call(struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long addr) { + struct ftrace_hotpatch_trampoline *trampoline; + u64 old; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + if (get_kernel_nofault(old, &trampoline->interceptor)) + return -EFAULT; + if (old != old_addr) + return -EINVAL; + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); return 0; } -int __init ftrace_dyn_arch_init(void) +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) { - return 0; + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, old_addr, addr); + else + return ftrace_modify_trampoline_call(rec, old_addr, addr); } -#ifdef CONFIG_MODULES - -static int __init ftrace_plt_init(void) +static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { - unsigned int *ip; + u16 old; + u8 op; - ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE); - if (!ftrace_plt) - panic("cannot allocate ftrace plt\n"); - ip = (unsigned int *) ftrace_plt; - ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ - ip[1] = 0x100a0004; - ip[2] = 0x07f10000; - ip[3] = FTRACE_ADDR >> 32; - ip[4] = FTRACE_ADDR & 0xffffffff; - set_memory_ro(ftrace_plt, 1); + if (get_kernel_nofault(old, addr)) + return -EFAULT; + if (old != expected) + return -EINVAL; + /* set mask field to all ones or zeroes */ + op = enable ? 0xf4 : 0x04; + s390_kernel_write((char *)addr + 1, &op, sizeof(op)); return 0; } -device_initcall(ftrace_plt_init); -#endif /* CONFIG_MODULES */ +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + /* Expect brcl 0xf,... for the !cpu_has_seq_insn() case */ + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, addr, 0); + else + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); +} -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* - * Hook the return address and push it in the stack of return addresses - * in current thread info. - */ -unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, - unsigned long ip) +static int ftrace_make_trampoline_call(struct dyn_ftrace *rec, unsigned long addr) { - if (unlikely(ftrace_graph_is_dead())) - goto out; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) - goto out; - ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(ra, ip, 0, (void *) sp)) - ra = (unsigned long) return_to_handler; -out: - return ra; + struct ftrace_hotpatch_trampoline *trampoline; + + trampoline = ftrace_get_trampoline(rec); + if (IS_ERR(trampoline)) + return PTR_ERR(trampoline); + s390_kernel_write(&trampoline->interceptor, &addr, sizeof(addr)); + /* Expect brcl 0x0,... */ + return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } -NOKPROBE_SYMBOL(prepare_ftrace_return); -/* - * Patch the kernel code at ftrace_graph_caller location. The instruction - * there is branch relative on condition. To enable the ftrace graph code - * block, we simply patch the mask field of the instruction to zero and - * turn the instruction into a nop. - * To disable the ftrace graph code the mask field will be patched to - * all ones, which turns the instruction into an unconditional branch. - */ -int ftrace_enable_ftrace_graph_caller(void) +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - u8 op = 0x04; /* set mask field to zero */ + if (cpu_has_seq_insn()) + return ftrace_patch_branch_insn(rec->ip, 0, addr); + else + return ftrace_make_trampoline_call(rec, addr); +} - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + ftrace_func = func; return 0; } -int ftrace_disable_ftrace_graph_caller(void) +void arch_ftrace_update_code(int command) { - u8 op = 0xf4; /* set mask field to all ones */ + ftrace_modify_all_code(command); +} - s390_kernel_write(__va(ftrace_graph_caller)+1, &op, sizeof(op)); - return 0; +void ftrace_arch_code_modify_post_process(void) +{ + /* + * Flush any pre-fetched instructions on all + * CPUs to make the new code visible. + */ + text_poke_sync_lock(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; + unsigned long sp = arch_ftrace_regs(fregs)->regs.gprs[15]; + + if (unlikely(ftrace_graph_is_dead())) + return; + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + if (!function_graph_enter_regs(*parent, ip, 0, (unsigned long *)sp, fregs)) + *parent = (unsigned long)&return_to_handler; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -170,14 +288,17 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + kmsan_unpoison_memory(fregs, ftrace_regs_size()); regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!regs || unlikely(!p) || kprobe_disabled(p)) goto out; if (kprobe_running()) { @@ -203,7 +324,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } __this_cpu_write(current_kprobe, NULL); out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h new file mode 100644 index 000000000000..23337065f402 --- /dev/null +++ b/arch/s390/kernel/ftrace.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FTRACE_H +#define _FTRACE_H + +#include <asm/types.h> + +struct ftrace_hotpatch_trampoline { + u16 brasl_opc; + s32 brasl_disp; + s16: 16; + u64 rest_of_intercepted_function; + u64 interceptor; +} __packed; + +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_start[]; +extern struct ftrace_hotpatch_trampoline __ftrace_hotpatch_trampolines_end[]; +extern const char ftrace_shared_hotpatch_trampoline_br[]; +extern const char ftrace_shared_hotpatch_trampoline_br_end[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl[]; +extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; + +#endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/guarded_storage.c b/arch/s390/kernel/guarded_storage.c index d14dd1c2e524..cf26d7a37425 100644 --- a/arch/s390/kernel/guarded_storage.c +++ b/arch/s390/kernel/guarded_storage.c @@ -4,6 +4,7 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ +#include <linux/cpufeature.h> #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -28,7 +29,7 @@ static int gs_enable(void) return -ENOMEM; gs_cb->gsd = 25; preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; preempt_enable(); @@ -42,7 +43,7 @@ static int gs_disable(void) preempt_disable(); kfree(current->thread.gs_cb); current->thread.gs_cb = NULL; - __ctl_clear_bit(2, 4); + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); preempt_enable(); } return 0; @@ -84,7 +85,7 @@ void gs_load_bc_cb(struct pt_regs *regs) if (gs_cb) { kfree(current->thread.gs_cb); current->thread.gs_bc_cb = NULL; - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); load_gs_cb(gs_cb); current->thread.gs_cb = gs_cb; } @@ -109,7 +110,7 @@ static int gs_broadcast(void) SYSCALL_DEFINE2(s390_guarded_storage, int, command, struct gs_cb __user *, gs_cb) { - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -EOPNOTSUPP; switch (command) { case GS_ENABLE: diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head.S index 0c253886da78..7edb9ded199c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head.S @@ -5,39 +5,36 @@ * Author(s): Hartmut Penner <hp@de.ibm.com> * Martin Schwidefsky <schwidefsky@de.ibm.com> * Rob van der Heij <rvdhei@iae.nl> - * Heiko Carstens <heiko.carstens@de.ibm.com> * */ #include <linux/init.h> #include <linux/linkage.h> +#include <asm/lowcore.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page.h> #include <asm/ptrace.h> __HEAD -ENTRY(startup_continue) - larl %r1,tod_clock_base - mvc 0(16,%r1),__LC_BOOT_CLOCK - larl %r13,.LPG1 # get base +SYM_CODE_START(startup_continue) # # Setup stack # + GET_LC %r2 larl %r14,init_task - stg %r14,__LC_CURRENT - larl %r15,init_thread_union+THREAD_SIZE-STACK_FRAME_OVERHEAD-__PT_SIZE -#ifdef CONFIG_KASAN - brasl %r14,kasan_early_init -#endif + stg %r14,__LC_CURRENT(%r2) + larl %r15,init_thread_union+STACK_INIT_OFFSET + stg %r15,__LC_KERNEL_STACK(%r2) + brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code # # We returned from start_kernel ?!? PANIK # basr %r13,0 - lpswe .Ldw-.(%r13) # load disabled wait psw + lpswe dw_psw-.(%r13) # load disabled wait psw +SYM_CODE_END(startup_continue) - .align 16 -.LPG1: -.Ldw: .quad 0x0002000180000000,0x0000000000000000 + .balign 16 +SYM_DATA_LOCAL(dw_psw, .quad 0x0002000180000000,0x0000000000000000) diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c new file mode 100644 index 000000000000..217206522266 --- /dev/null +++ b/arch/s390/kernel/hiperdispatch.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define pr_fmt(fmt) "hd: " fmt + +/* + * Hiperdispatch: + * Dynamically calculates the optimum number of high capacity COREs + * by considering the state the system is in. When hiperdispatch decides + * that a capacity update is necessary, it schedules a topology update. + * During topology updates the CPU capacities are always re-adjusted. + * + * There is two places where CPU capacities are being accessed within + * hiperdispatch. + * -> hiperdispatch's reoccuring work function reads CPU capacities to + * determine high capacity CPU count. + * -> during a topology update hiperdispatch's adjustment function + * updates CPU capacities. + * These two can run on different CPUs in parallel which can cause + * hiperdispatch to make wrong decisions. This can potentially cause + * some overhead by leading to extra rebuild_sched_domains() calls + * for correction. Access to capacities within hiperdispatch has to be + * serialized to prevent the overhead. + * + * Hiperdispatch decision making revolves around steal time. + * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time + * crosses the threshold value hiperdispatch falls back to giving high + * capacities to entitled CPUs. When steal time drops below the + * threshold boundary, hiperdispatch utilizes all CPUs by giving all + * of them high capacity. + * + * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread + * performance. Comparing the throughput of; + * - single CORE, with N threads, running N tasks + * - N separate COREs running N tasks, + * using individual COREs for individual tasks yield better + * performance. This performance difference is roughly ~30% (can change + * between machine generations) + * + * Hiperdispatch tries to hint scheduler to use individual COREs for + * each task, as long as steal time on those COREs are less than 30%, + * therefore delaying the throughput loss caused by using SMP threads. + */ + +#include <linux/cpufeature.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/kernel_stat.h> +#include <linux/kstrtox.h> +#include <linux/ktime.h> +#include <linux/sysctl.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <asm/hiperdispatch.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/topology.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hiperdispatch.h> + +#define HD_DELAY_FACTOR (4) +#define HD_DELAY_INTERVAL (HZ / 4) +#define HD_STEAL_THRESHOLD 10 +#define HD_STEAL_AVG_WEIGHT 16 + +static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ +static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */ +static int hd_high_capacity_cores; /* Current CORE count with high capacity */ +static int hd_entitled_cores; /* Total vertical high and medium CORE count */ +static int hd_online_cores; /* Current online CORE count */ + +static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */ +static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */ +static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */ +static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */ + +static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD; +static unsigned int hd_delay_factor = HD_DELAY_FACTOR; +static int hd_enabled; + +static void hd_capacity_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); + +static int hd_set_hiperdispatch_mode(int enable) +{ + if (!cpu_has_topology()) + enable = 0; + if (hd_enabled == enable) + return 0; + hd_enabled = enable; + return 1; +} + +void hd_reset_state(void) +{ + cpumask_clear(&hd_vl_coremask); + cpumask_clear(&hd_vmvl_cpumask); + hd_entitled_cores = 0; + hd_online_cores = 0; +} + +void hd_add_core(int cpu) +{ + const struct cpumask *siblings; + int polarization; + + hd_online_cores++; + polarization = smp_cpu_get_polarization(cpu); + siblings = topology_sibling_cpumask(cpu); + switch (polarization) { + case POLARIZATION_VH: + hd_entitled_cores++; + break; + case POLARIZATION_VM: + hd_entitled_cores++; + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + case POLARIZATION_VL: + cpumask_set_cpu(cpu, &hd_vl_coremask); + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + } +} + +/* Serialize update and read operations of debug counters. */ +static DEFINE_MUTEX(hd_counter_mutex); + +static void hd_update_times(void) +{ + static ktime_t prev; + ktime_t now; + + /* + * Check if hiperdispatch is active, if not set the prev to 0. + * This way it is possible to differentiate the first update iteration after + * enabling hiperdispatch. + */ + if (hd_entitled_cores == 0 || hd_enabled == 0) { + prev = ktime_set(0, 0); + return; + } + now = ktime_get(); + if (ktime_after(prev, 0)) { + if (hd_high_capacity_cores == hd_online_cores) + hd_high_time += ktime_ms_delta(now, prev); + else + hd_low_time += ktime_ms_delta(now, prev); + } + prev = now; +} + +static void hd_update_capacities(void) +{ + int cpu, upscaling_cores; + unsigned long capacity; + + upscaling_cores = hd_high_capacity_cores - hd_entitled_cores; + capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW; + hd_high_capacity_cores = hd_entitled_cores; + for_each_cpu(cpu, &hd_vl_coremask) { + smp_set_core_capacity(cpu, capacity); + if (capacity != CPU_CAPACITY_HIGH) + continue; + hd_high_capacity_cores++; + upscaling_cores--; + if (upscaling_cores == 0) + capacity = CPU_CAPACITY_LOW; + } +} + +void hd_disable_hiperdispatch(void) +{ + cancel_delayed_work_sync(&hd_capacity_work); + hd_high_capacity_cores = hd_online_cores; + hd_previous_steal = 0; +} + +int hd_enable_hiperdispatch(void) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + mutex_unlock(&hd_counter_mutex); + if (hd_enabled == 0) + return 0; + if (hd_entitled_cores == 0) + return 0; + if (hd_online_cores <= hd_entitled_cores) + return 0; + mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + hd_update_capacities(); + return 1; +} + +static unsigned long hd_steal_avg(unsigned long new) +{ + static unsigned long steal; + + steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT; + return steal; +} + +static unsigned long hd_calculate_steal_percentage(void) +{ + unsigned long time_delta, steal_delta, steal, percentage; + static ktime_t prev; + int cpus, cpu; + ktime_t now; + + cpus = 0; + steal = 0; + percentage = 0; + for_each_cpu(cpu, &hd_vmvl_cpumask) { + steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + cpus++; + } + /* + * If there is no vertical medium and low CPUs steal time + * is 0 as vertical high CPUs shouldn't experience steal time. + */ + if (cpus == 0) + return percentage; + now = ktime_get(); + time_delta = ktime_to_ns(ktime_sub(now, prev)); + if (steal > hd_previous_steal && hd_previous_steal != 0) { + steal_delta = (steal - hd_previous_steal) * 100 / time_delta; + percentage = steal_delta / cpus; + } + hd_previous_steal = steal; + prev = now; + return percentage; +} + +static void hd_capacity_work_fn(struct work_struct *work) +{ + unsigned long steal_percentage, new_cores; + + mutex_lock(&smp_cpu_state_mutex); + /* + * If online cores are less or equal to entitled cores hiperdispatch + * does not need to make any adjustments, call a topology update to + * disable hiperdispatch. + * Normally this check is handled on topology update, but during cpu + * unhotplug, topology and cpu mask updates are done in reverse + * order, causing hd_enable_hiperdispatch() to get stale data. + */ + if (hd_online_cores <= hd_entitled_cores) { + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return; + } + steal_percentage = hd_steal_avg(hd_calculate_steal_percentage()); + if (steal_percentage < hd_steal_threshold) + new_cores = hd_online_cores; + else + new_cores = hd_entitled_cores; + if (hd_high_capacity_cores != new_cores) { + trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores); + hd_high_capacity_cores = new_cores; + atomic64_inc(&hd_adjustments); + topology_schedule_update(); + } + trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores); + mutex_unlock(&smp_cpu_state_mutex); + schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL); +} + +static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int hiperdispatch; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &hiperdispatch, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + hiperdispatch = hd_enabled; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + mutex_lock(&smp_cpu_state_mutex); + if (hd_set_hiperdispatch_mode(hiperdispatch)) + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return 0; +} + +static const struct ctl_table hiperdispatch_ctl_table[] = { + { + .procname = "hiperdispatch", + .mode = 0644, + .proc_handler = hiperdispatch_ctl_handler, + }, +}; + +static ssize_t hd_steal_threshold_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_steal_threshold); +} + +static ssize_t hd_steal_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (val > 100) + return -ERANGE; + hd_steal_threshold = val; + return count; +} + +static DEVICE_ATTR_RW(hd_steal_threshold); + +static ssize_t hd_delay_factor_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_delay_factor); +} + +static ssize_t hd_delay_factor_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (!val) + return -ERANGE; + hd_delay_factor = val; + return count; +} + +static DEVICE_ATTR_RW(hd_delay_factor); + +static struct attribute *hd_attrs[] = { + &dev_attr_hd_steal_threshold.attr, + &dev_attr_hd_delay_factor.attr, + NULL, +}; + +static const struct attribute_group hd_attr_group = { + .name = "hiperdispatch", + .attrs = hd_attrs, +}; + +static int hd_greedy_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_high_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n"); + +static int hd_conservative_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_low_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n"); + +static int hd_adjustment_count_get(void *unused, u64 *val) +{ + *val = atomic64_read(&hd_adjustments); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n"); + +static void __init hd_create_debugfs_counters(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir); + debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops); + debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops); + debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops); +} + +static void __init hd_create_attributes(void) +{ + struct device *dev; + + dev = bus_get_dev_root(&cpu_subsys); + if (!dev) + return; + if (sysfs_create_group(&dev->kobj, &hd_attr_group)) + pr_warn("Unable to create hiperdispatch attribute group\n"); + put_device(dev); +} + +static int __init hd_init(void) +{ + if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) { + hd_set_hiperdispatch_mode(1); + topology_schedule_update(); + } + if (!register_sysctl("s390", hiperdispatch_ctl_table)) + pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n"); + hd_create_debugfs_counters(); + hd_create_attributes(); + return 0; +} +late_initcall(hd_init); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 4bf1ee293f2b..39cb8d0ae348 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -12,9 +12,9 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/cpu.h> -#include <linux/sched/cputime.h> #include <trace/events/power.h> #include <asm/cpu_mf.h> +#include <asm/cputime.h> #include <asm/nmi.h> #include <asm/smp.h> #include "entry.h" @@ -24,117 +24,66 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); + unsigned long idle_time; u64 cycles_new[8]; int i; - clear_cpu_flag(CIF_ENABLED_WAIT); if (smp_cpu_mtid) { stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); for (i = 0; i < smp_cpu_mtid; i++) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle->clock_idle_exit = S390_lowcore.int_clock; - idle->timer_idle_exit = S390_lowcore.sys_enter_timer; + idle_time = lc->int_clock - idle->clock_idle_enter; + + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; - S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; - S390_lowcore.last_update_clock = idle->clock_idle_exit; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; - S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; - S390_lowcore.last_update_timer = idle->timer_idle_exit; + /* Account time spent with enabled wait psw loaded as idle time. */ + WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); + WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + account_idle_time(cputime_to_nsecs(idle_time)); } -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - unsigned long idle_time; unsigned long psw_mask; /* Wait for external, I/O or machine check interrupt. */ - psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; + psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | + PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); - - /* psw_idle() returns with interrupts disabled. */ - psw_idle(idle, psw_mask); - - /* Account time spent with enabled wait psw loaded as idle time. */ - raw_write_seqcount_begin(&idle->seqcount); - idle_time = idle->clock_idle_exit - idle->clock_idle_enter; - idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; - idle->idle_time += idle_time; - idle->idle_count++; - account_idle_time(cputime_to_nsecs(idle_time)); - raw_write_seqcount_end(&idle->seqcount); - raw_local_irq_enable(); + set_cpu_flag(CIF_ENABLED_WAIT); + if (smp_cpu_mtid) + stcctm(MT_DIAG, smp_cpu_mtid, (u64 *)&idle->mt_cycles_enter); + idle->clock_idle_enter = get_tod_clock_fast(); + idle->timer_idle_enter = get_cpu_timer(); + bpon(); + __load_psw_mask(psw_mask); } static ssize_t show_idle_count(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned long idle_count; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_count = READ_ONCE(idle->idle_count); - if (READ_ONCE(idle->clock_idle_enter)) - idle_count++; - } while (read_seqcount_retry(&idle->seqcount, seq)); - return sprintf(buf, "%lu\n", idle_count); + + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_count)); } DEVICE_ATTR(idle_count, 0444, show_idle_count, NULL); static ssize_t show_idle_time(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - unsigned long now, idle_time, idle_enter, idle_exit, in_idle; struct s390_idle_data *idle = &per_cpu(s390_idle, dev->id); - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_time = READ_ONCE(idle->idle_time); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - idle_time += in_idle; - return sprintf(buf, "%lu\n", idle_time >> 12); -} -DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); -u64 arch_cpu_idle_time(int cpu) -{ - struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); - unsigned long now, idle_enter, idle_exit, in_idle; - unsigned int seq; - - do { - seq = read_seqcount_begin(&idle->seqcount); - idle_enter = READ_ONCE(idle->clock_idle_enter); - idle_exit = READ_ONCE(idle->clock_idle_exit); - } while (read_seqcount_retry(&idle->seqcount, seq)); - in_idle = 0; - now = get_tod_clock(); - if (idle_enter) { - if (idle_exit) { - in_idle = idle_exit - idle_enter; - } else if (now > idle_enter) { - in_idle = now - idle_enter; - } - } - return cputime_to_nsecs(in_idle); + return sysfs_emit(buf, "%lu\n", READ_ONCE(idle->idle_time) >> 12); } +DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); void arch_cpu_idle_enter(void) { @@ -144,7 +93,7 @@ void arch_cpu_idle_exit(void) { } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { cpu_die(); } diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 50e2c21e0ec9..961a3d60a4dd 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -4,7 +4,6 @@ * * Copyright IBM Corp. 2005, 2012 * Author(s): Michael Holzheu <holzheu@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Volker Sameske <sameske@de.ibm.com> */ @@ -13,6 +12,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/delay.h> +#include <linux/kstrtox.h> #include <linux/panic_notifier.h> #include <linux/reboot.h> #include <linux/ctype.h> @@ -20,6 +20,9 @@ #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <linux/vmalloc.h> +#include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/ipl.h> #include <asm/smp.h> @@ -29,6 +32,7 @@ #include <asm/sclp.h> #include <asm/checksum.h> #include <asm/debug.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> @@ -38,6 +42,8 @@ #define IPL_UNKNOWN_STR "unknown" #define IPL_CCW_STR "ccw" +#define IPL_ECKD_STR "eckd" +#define IPL_ECKD_DUMP_STR "eckd_dump" #define IPL_FCP_STR "fcp" #define IPL_FCP_DUMP_STR "fcp_dump" #define IPL_NVME_STR "nvme" @@ -45,6 +51,7 @@ #define IPL_NSS_STR "nss" #define DUMP_CCW_STR "ccw" +#define DUMP_ECKD_STR "eckd" #define DUMP_FCP_STR "fcp" #define DUMP_NVME_STR "nvme" #define DUMP_NONE_STR "none" @@ -91,6 +98,10 @@ static char *ipl_type_str(enum ipl_type type) switch (type) { case IPL_TYPE_CCW: return IPL_CCW_STR; + case IPL_TYPE_ECKD: + return IPL_ECKD_STR; + case IPL_TYPE_ECKD_DUMP: + return IPL_ECKD_DUMP_STR; case IPL_TYPE_FCP: return IPL_FCP_STR; case IPL_TYPE_FCP_DUMP: @@ -112,6 +123,7 @@ enum dump_type { DUMP_TYPE_CCW = 2, DUMP_TYPE_FCP = 4, DUMP_TYPE_NVME = 8, + DUMP_TYPE_ECKD = 16, }; static char *dump_type_str(enum dump_type type) @@ -121,6 +133,8 @@ static char *dump_type_str(enum dump_type type) return DUMP_NONE_STR; case DUMP_TYPE_CCW: return DUMP_CCW_STR; + case DUMP_TYPE_ECKD: + return DUMP_ECKD_STR; case DUMP_TYPE_FCP: return DUMP_FCP_STR; case DUMP_TYPE_NVME: @@ -146,6 +160,7 @@ static enum ipl_type reipl_type = IPL_TYPE_UNKNOWN; static struct ipl_parameter_block *reipl_block_fcp; static struct ipl_parameter_block *reipl_block_nvme; static struct ipl_parameter_block *reipl_block_ccw; +static struct ipl_parameter_block *reipl_block_eckd; static struct ipl_parameter_block *reipl_block_nss; static struct ipl_parameter_block *reipl_block_actual; @@ -154,20 +169,24 @@ static enum dump_type dump_type = DUMP_TYPE_NONE; static struct ipl_parameter_block *dump_block_fcp; static struct ipl_parameter_block *dump_block_nvme; static struct ipl_parameter_block *dump_block_ccw; +static struct ipl_parameter_block *dump_block_eckd; static struct sclp_ipl_info sclp_ipl_info; static bool reipl_nvme_clear; static bool reipl_fcp_clear; static bool reipl_ccw_clear; +static bool reipl_eckd_clear; -static inline int __diag308(unsigned long subcode, void *addr) +static unsigned long os_info_flags; + +static inline int __diag308(unsigned long subcode, unsigned long addr) { union register_pair r1; - r1.even = (unsigned long) addr; + r1.even = addr; r1.odd = 0; - asm volatile( + asm_inline volatile( " diag %[r1],%[subcode],0x308\n" "0: nopr %%r7\n" EX_TABLE(0b,0b) @@ -179,10 +198,8 @@ static inline int __diag308(unsigned long subcode, void *addr) int diag308(unsigned long subcode, void *addr) { - if (IS_ENABLED(CONFIG_KASAN)) - __arch_local_irq_stosm(0x04); /* enable DAT */ diag_stat_inc(DIAG_STAT_X308); - return __diag308(subcode, addr); + return __diag308(subcode, addr ? virt_to_phys(addr) : 0); } EXPORT_SYMBOL_GPL(diag308); @@ -193,7 +210,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ char *page) \ { \ - return scnprintf(page, PAGE_SIZE, _format, ##args); \ + return sysfs_emit(page, _format, ##args); \ } #define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ @@ -219,14 +236,14 @@ IPL_ATTR_SHOW_FN(_prefix, _name, "0.%x.%04x\n", \ _ipl_blk.ssid, _ipl_blk.devno); \ IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk); \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, (S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) \ #define DEFINE_IPL_ATTR_RO(_prefix, _name, _format, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _format, _value) \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name, S_IRUGO, sys_##_prefix##_##_name##_show, NULL) + __ATTR(_name, 0444, sys_##_prefix##_##_name##_show, NULL) #define DEFINE_IPL_ATTR_RW(_prefix, _name, _fmt_out, _fmt_in, _value) \ IPL_ATTR_SHOW_FN(_prefix, _name, _fmt_out, (unsigned long long) _value) \ @@ -241,7 +258,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) @@ -251,15 +268,74 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strncpy(_value, buf, sizeof(_value) - 1); \ + if (len >= sizeof(_value)) \ + return -E2BIG; \ + len = strscpy(_value, buf); \ + if ((ssize_t)len < 0) \ + return len; \ strim(_value); \ return len; \ } \ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ - __ATTR(_name,(S_IRUGO | S_IWUSR), \ + __ATTR(_name, 0644, \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static ssize_t sys_##_prefix##_scp_data_show(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t size = _ipl_block.scp_data_len; \ + void *scp_data = _ipl_block.scp_data; \ + \ + return memory_read_from_buffer(buf, count, &off, \ + scp_data, size); \ +} + +#define IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static ssize_t sys_##_prefix##_scp_data_store(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t scpdata_len = count; \ + size_t padding; \ + \ + if (off) \ + return -EINVAL; \ + \ + memcpy(_ipl_block.scp_data, buf, count); \ + if (scpdata_len % 8) { \ + padding = 8 - (scpdata_len % 8); \ + memset(_ipl_block.scp_data + scpdata_len, \ + 0, padding); \ + scpdata_len += padding; \ + } \ + \ + _ipl_block_hdr.len = _ipl_bp_len + scpdata_len; \ + _ipl_block.len = _ipl_bp0_len + scpdata_len; \ + _ipl_block.scp_data_len = scpdata_len; \ + \ + return count; \ +} + +#define DEFINE_IPL_ATTR_SCP_DATA_RO(_prefix, _ipl_block, _size) \ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0444, sys_##_prefix##_scp_data_show, \ + NULL, _size) + +#define DEFINE_IPL_ATTR_SCP_DATA_RW(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len, _size)\ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0644, sys_##_prefix##_scp_data_show, \ + sys_##_prefix##_scp_data_store, _size) + /* * ipl section */ @@ -282,6 +358,11 @@ static __init enum ipl_type get_ipl_type(void) return IPL_TYPE_NVME_DUMP; else return IPL_TYPE_NVME; + case IPL_PBT_ECKD: + if (ipl_block.eckd.opt == IPL_PB0_ECKD_OPT_DUMP) + return IPL_TYPE_ECKD_DUMP; + else + return IPL_TYPE_ECKD; } return IPL_TYPE_UNKNOWN; } @@ -292,7 +373,7 @@ EXPORT_SYMBOL_GPL(ipl_info); static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", ipl_type_str(ipl_info.type)); + return sysfs_emit(page, "%s\n", ipl_type_str(ipl_info.type)); } static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); @@ -300,7 +381,7 @@ static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); static ssize_t ipl_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!ipl_secure_flag); + return sysfs_emit(page, "%i\n", !!ipl_secure_flag); } static struct kobj_attribute sys_ipl_secure_attr = @@ -309,7 +390,7 @@ static struct kobj_attribute sys_ipl_secure_attr = static ssize_t ipl_has_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!sclp.has_sipl); + return sysfs_emit(page, "%i\n", !!sclp.has_sipl); } static struct kobj_attribute sys_ipl_has_secure_attr = @@ -322,79 +403,69 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); - return sprintf(page, "%s\n", parm); + return sysfs_emit(page, "%s\n", parm); } static struct kobj_attribute sys_ipl_vm_parm_attr = - __ATTR(parm, S_IRUGO, ipl_vm_parm_show, NULL); + __ATTR(parm, 0444, ipl_vm_parm_show, NULL); static ssize_t sys_ipl_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, - ipl_block.ccw.devno); + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + return sysfs_emit(page, "0.0.%04x\n", ipl_block.fcp.devno); case IPL_TYPE_NVME: case IPL_TYPE_NVME_DUMP: - return sprintf(page, "%08ux\n", ipl_block.nvme.fid); + return sysfs_emit(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } } static struct kobj_attribute sys_ipl_device_attr = - __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); + __ATTR(device, 0444, sys_ipl_device_show, NULL); -static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) +static ssize_t sys_ipl_parameter_read(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { return memory_read_from_buffer(buf, count, &off, &ipl_block, ipl_block.hdr.len); } -static struct bin_attribute ipl_parameter_attr = - __BIN_ATTR(binary_parameter, S_IRUGO, ipl_parameter_read, NULL, +static const struct bin_attribute sys_ipl_parameter_attr = + __BIN_ATTR(binary_parameter, 0444, sys_ipl_parameter_read, NULL, PAGE_SIZE); -static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.fcp.scp_data_len; - void *scp_data = &ipl_block.fcp.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.nvme.scp_data_len; - void *scp_data = &ipl_block.nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_fcp, ipl_block.fcp, PAGE_SIZE); -static struct bin_attribute ipl_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_scp_data_read, NULL, PAGE_SIZE); +static const struct bin_attribute *const ipl_fcp_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_fcp_scp_data_attr, + NULL, +}; -static struct bin_attribute ipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, S_IRUGO, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_nvme, ipl_block.nvme, PAGE_SIZE); -static struct bin_attribute *ipl_fcp_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_scp_data_attr, +static const struct bin_attribute *const ipl_nvme_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_nvme_scp_data_attr, NULL, }; -static struct bin_attribute *ipl_nvme_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_nvme_scp_data_attr, +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_eckd, ipl_block.eckd, PAGE_SIZE); + +static const struct bin_attribute *const ipl_eckd_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_eckd_scp_data_attr, NULL, }; @@ -419,103 +490,180 @@ DEFINE_IPL_ATTR_RO(ipl_nvme, bootprog, "%lld\n", DEFINE_IPL_ATTR_RO(ipl_nvme, br_lba, "%lld\n", (unsigned long long)ipl_block.nvme.br_lba); +/* ECKD ipl device attributes */ +DEFINE_IPL_ATTR_RO(ipl_eckd, bootprog, "%lld\n", + (unsigned long long)ipl_block.eckd.bootprog); + +#define IPL_ATTR_BR_CHR_SHOW_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *buf) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + \ + if (!ipb->br_chr.cyl && \ + !ipb->br_chr.head && \ + !ipb->br_chr.record) \ + return sysfs_emit(buf, "auto\n"); \ + \ + return sysfs_emit(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ +} + +#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ +static ssize_t eckd_##_name##_br_chr_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct ipl_pb0_eckd *ipb = &(_ipb); \ + unsigned long args[3] = { 0 }; \ + char *p, *p1, *tmp = NULL; \ + int i, rc; \ + \ + if (!strncmp(buf, "auto", 4)) \ + goto out; \ + \ + tmp = kstrdup(buf, GFP_KERNEL); \ + p = tmp; \ + for (i = 0; i < 3; i++) { \ + p1 = strsep(&p, ", "); \ + if (!p1) { \ + rc = -EINVAL; \ + goto err; \ + } \ + rc = kstrtoul(p1, 0, args + i); \ + if (rc) \ + goto err; \ + } \ + \ + rc = -EINVAL; \ + if (i != 3) \ + goto err; \ + \ + if ((args[0] || args[1]) && !args[2]) \ + goto err; \ + \ + if (args[0] > UINT_MAX || args[1] > 255 || args[2] > 255) \ + goto err; \ + \ +out: \ + ipb->br_chr.cyl = args[0]; \ + ipb->br_chr.head = args[1]; \ + ipb->br_chr.record = args[2]; \ + rc = len; \ +err: \ + kfree(tmp); \ + return rc; \ +} + +IPL_ATTR_BR_CHR_SHOW_FN(ipl, ipl_block.eckd); +static struct kobj_attribute sys_ipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_ipl_br_chr_show, NULL); + +IPL_ATTR_BR_CHR_SHOW_FN(reipl, reipl_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(reipl, reipl_block_eckd->eckd); + +static struct kobj_attribute sys_reipl_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_reipl_br_chr_show, eckd_reipl_br_chr_store); + static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { char loadparm[LOADPARM_LEN + 1] = {}; if (!sclp_ipl_info.is_valid) - return sprintf(page, "#unknown#\n"); + return sysfs_emit(page, "#unknown#\n"); memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); strim(loadparm); - return sprintf(page, "%s\n", loadparm); + return sysfs_emit(page, "%s\n", loadparm); } static struct kobj_attribute sys_ipl_ccw_loadparm_attr = __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); static struct attribute *ipl_fcp_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_fcp_wwpn_attr.attr, &sys_ipl_fcp_lun_attr.attr, &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_fcp_attr_group = { +static const struct attribute_group ipl_fcp_attr_group = { .attrs = ipl_fcp_attrs, .bin_attrs = ipl_fcp_bin_attrs, }; static struct attribute *ipl_nvme_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_nvme_fid_attr.attr, &sys_ipl_nvme_nsid_attr.attr, &sys_ipl_nvme_bootprog_attr.attr, &sys_ipl_nvme_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_nvme_attr_group = { +static const struct attribute_group ipl_nvme_attr_group = { .attrs = ipl_nvme_attrs, .bin_attrs = ipl_nvme_bin_attrs, }; +static struct attribute *ipl_eckd_attrs[] = { + &sys_ipl_eckd_bootprog_attr.attr, + &sys_ipl_eckd_br_chr_attr.attr, + &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_device_attr.attr, + NULL, +}; + +static const struct attribute_group ipl_eckd_attr_group = { + .attrs = ipl_eckd_attrs, + .bin_attrs = ipl_eckd_bin_attrs, +}; /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; static struct attribute *ipl_ccw_attrs_lpar[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_ccw_attr_group_vm = { +static const struct attribute_group ipl_ccw_attr_group_vm = { .attrs = ipl_ccw_attrs_vm, }; -static struct attribute_group ipl_ccw_attr_group_lpar = { +static const struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* UNKNOWN ipl device attributes */ - -static struct attribute *ipl_unknown_attrs[] = { +static struct attribute *ipl_common_attrs[] = { &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_unknown_attr_group = { - .attrs = ipl_unknown_attrs, +static const struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, }; static struct kset *ipl_kset; static void __ipl_run(void *unused) { - __bpon(); diag308(DIAG308_LOAD_CLEAR, NULL); } @@ -533,15 +681,22 @@ static int __init ipl_init(void) rc = -ENOMEM; goto out; } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: - if (MACHINE_IS_VM) + if (machine_is_vm()) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_vm); else rc = sysfs_create_group(&ipl_kset->kobj, &ipl_ccw_attr_group_lpar); break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_fcp_attr_group); @@ -551,8 +706,6 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); break; default: - rc = sysfs_create_group(&ipl_kset->kobj, - &ipl_unknown_attr_group); break; } out: @@ -579,7 +732,7 @@ static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; ipl_block_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); - return sprintf(page, "%s\n", vmparm); + return sysfs_emit(page, "%s\n", vmparm); } static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, @@ -643,54 +796,20 @@ static ssize_t reipl_ccw_vmparm_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_nss_vmparm_show, - reipl_nss_vmparm_store); + __ATTR(parm, 0644, reipl_nss_vmparm_show, + reipl_nss_vmparm_store); static struct kobj_attribute sys_reipl_ccw_vmparm_attr = - __ATTR(parm, S_IRUGO | S_IWUSR, reipl_ccw_vmparm_show, - reipl_ccw_vmparm_store); + __ATTR(parm, 0644, reipl_ccw_vmparm_show, + reipl_ccw_vmparm_store); /* FCP reipl device attributes */ -static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_fcp->fcp.scp_data_len; - void *scp_data = reipl_block_fcp->fcp.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - - if (off) - return -EINVAL; - - memcpy(reipl_block_fcp->fcp.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->fcp.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } - - reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.scp_data_len = scpdata_len; +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_fcp, reipl_block_fcp->hdr, + reipl_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); - return count; -} -static struct bin_attribute sys_reipl_fcp_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_fcp_scpdata_read, - reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_fcp_bin_attrs[] = { +static const struct bin_attribute *const reipl_fcp_bin_attrs[] = { &sys_reipl_fcp_scp_data_attr, NULL, }; @@ -721,7 +840,7 @@ static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb, char buf[LOADPARM_LEN + 1]; reipl_get_ascii_loadparm(buf, ipb); - return sprintf(page, "%s\n", buf); + return sysfs_emit(page, "%s\n", buf); } static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, @@ -752,35 +871,39 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return len; } -/* FCP wrapper */ -static ssize_t reipl_fcp_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_fcp, page); -} - -static ssize_t reipl_fcp_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_fcp, buf, len); -} - -static struct kobj_attribute sys_reipl_fcp_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_fcp_loadparm_show, - reipl_fcp_loadparm_store); +#define DEFINE_GENERIC_LOADPARM(name) \ +static ssize_t reipl_##name##_loadparm_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *page) \ +{ \ + return reipl_generic_loadparm_show(reipl_block_##name, page); \ +} \ +static ssize_t reipl_##name##_loadparm_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return reipl_generic_loadparm_store(reipl_block_##name, buf, len); \ +} \ +static struct kobj_attribute sys_reipl_##name##_loadparm_attr = \ + __ATTR(loadparm, 0644, reipl_##name##_loadparm_show, \ + reipl_##name##_loadparm_store) + +DEFINE_GENERIC_LOADPARM(fcp); +DEFINE_GENERIC_LOADPARM(nvme); +DEFINE_GENERIC_LOADPARM(ccw); +DEFINE_GENERIC_LOADPARM(nss); +DEFINE_GENERIC_LOADPARM(eckd); static ssize_t reipl_fcp_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_fcp_clear); + return sysfs_emit(page, "%u\n", reipl_fcp_clear); } static ssize_t reipl_fcp_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_fcp_clear) < 0) + if (kstrtobool(buf, &reipl_fcp_clear) < 0) return -EINVAL; return len; } @@ -795,7 +918,7 @@ static struct attribute *reipl_fcp_attrs[] = { NULL, }; -static struct attribute_group reipl_fcp_attr_group = { +static const struct attribute_group reipl_fcp_attr_group = { .attrs = reipl_fcp_attrs, .bin_attrs = reipl_fcp_bin_attrs, }; @@ -805,46 +928,12 @@ static struct kobj_attribute sys_reipl_fcp_clear_attr = /* NVME reipl device attributes */ -static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_nvme->nvme.scp_data_len; - void *scp_data = reipl_block_nvme->nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - if (off) - return -EINVAL; - - memcpy(reipl_block_nvme->nvme.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_nvme->nvme.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_nvme, reipl_block_nvme->hdr, + reipl_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); - reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.scp_data_len = scpdata_len; - - return count; -} - -static struct bin_attribute sys_reipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, (S_IRUGO | S_IWUSR), reipl_nvme_scpdata_read, - reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_nvme_bin_attrs[] = { +static const struct bin_attribute *const reipl_nvme_bin_attrs[] = { &sys_reipl_nvme_scp_data_attr, NULL, }; @@ -858,24 +947,6 @@ DEFINE_IPL_ATTR_RW(reipl_nvme, bootprog, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(reipl_nvme, br_lba, "%lld\n", "%lld\n", reipl_block_nvme->nvme.br_lba); -/* nvme wrapper */ -static ssize_t reipl_nvme_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nvme, page); -} - -static ssize_t reipl_nvme_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nvme, buf, len); -} - -static struct kobj_attribute sys_reipl_nvme_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nvme_loadparm_show, - reipl_nvme_loadparm_store); - static struct attribute *reipl_nvme_attrs[] = { &sys_reipl_nvme_fid_attr.attr, &sys_reipl_nvme_nsid_attr.attr, @@ -885,7 +956,7 @@ static struct attribute *reipl_nvme_attrs[] = { NULL, }; -static struct attribute_group reipl_nvme_attr_group = { +static const struct attribute_group reipl_nvme_attr_group = { .attrs = reipl_nvme_attrs, .bin_attrs = reipl_nvme_bin_attrs }; @@ -893,14 +964,14 @@ static struct attribute_group reipl_nvme_attr_group = { static ssize_t reipl_nvme_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_nvme_clear); + return sysfs_emit(page, "%u\n", reipl_nvme_clear); } static ssize_t reipl_nvme_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_nvme_clear) < 0) + if (kstrtobool(buf, &reipl_nvme_clear) < 0) return -EINVAL; return len; } @@ -911,49 +982,17 @@ static struct kobj_attribute sys_reipl_nvme_clear_attr = /* CCW reipl device attributes */ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); -/* NSS wrapper */ -static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_nss, page); -} - -static ssize_t reipl_nss_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_nss, buf, len); -} - -/* CCW wrapper */ -static ssize_t reipl_ccw_loadparm_show(struct kobject *kobj, - struct kobj_attribute *attr, char *page) -{ - return reipl_generic_loadparm_show(reipl_block_ccw, page); -} - -static ssize_t reipl_ccw_loadparm_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t len) -{ - return reipl_generic_loadparm_store(reipl_block_ccw, buf, len); -} - -static struct kobj_attribute sys_reipl_ccw_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_ccw_loadparm_show, - reipl_ccw_loadparm_store); - static ssize_t reipl_ccw_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_ccw_clear); + return sysfs_emit(page, "%u\n", reipl_ccw_clear); } static ssize_t reipl_ccw_clear_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - if (strtobool(buf, &reipl_ccw_clear) < 0) + if (kstrtobool(buf, &reipl_ccw_clear) < 0) return -EINVAL; return len; } @@ -986,6 +1025,52 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { .attrs = reipl_ccw_attrs_lpar, }; +/* ECKD reipl device attributes */ + +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_eckd, reipl_block_eckd->hdr, + reipl_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); + +static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { + &sys_reipl_eckd_scp_data_attr, + NULL, +}; + +DEFINE_IPL_CCW_ATTR_RW(reipl_eckd, device, reipl_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(reipl_eckd, bootprog, "%lld\n", "%lld\n", + reipl_block_eckd->eckd.bootprog); + +static struct attribute *reipl_eckd_attrs[] = { + &sys_reipl_eckd_device_attr.attr, + &sys_reipl_eckd_bootprog_attr.attr, + &sys_reipl_eckd_br_chr_attr.attr, + &sys_reipl_eckd_loadparm_attr.attr, + NULL, +}; + +static const struct attribute_group reipl_eckd_attr_group = { + .attrs = reipl_eckd_attrs, + .bin_attrs = reipl_eckd_bin_attrs +}; + +static ssize_t reipl_eckd_clear_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sysfs_emit(page, "%u\n", reipl_eckd_clear); +} + +static ssize_t reipl_eckd_clear_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + if (kstrtobool(buf, &reipl_eckd_clear) < 0) + return -EINVAL; + return len; +} + +static struct kobj_attribute sys_reipl_eckd_clear_attr = + __ATTR(clear, 0644, reipl_eckd_clear_show, reipl_eckd_clear_store); /* NSS reipl device attributes */ static void reipl_get_ascii_nss_name(char *dst, @@ -1002,7 +1087,7 @@ static ssize_t reipl_nss_name_show(struct kobject *kobj, char nss_name[NSS_NAME_SIZE + 1] = {}; reipl_get_ascii_nss_name(nss_name, reipl_block_nss); - return sprintf(page, "%s\n", nss_name); + return sysfs_emit(page, "%s\n", nss_name); } static ssize_t reipl_nss_name_store(struct kobject *kobj, @@ -1033,12 +1118,8 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, } static struct kobj_attribute sys_reipl_nss_name_attr = - __ATTR(name, S_IRUGO | S_IWUSR, reipl_nss_name_show, - reipl_nss_name_store); - -static struct kobj_attribute sys_reipl_nss_loadparm_attr = - __ATTR(loadparm, S_IRUGO | S_IWUSR, reipl_nss_loadparm_show, - reipl_nss_loadparm_store); + __ATTR(name, 0644, reipl_nss_name_show, + reipl_nss_name_store); static struct attribute *reipl_nss_attrs[] = { &sys_reipl_nss_name_attr.attr, @@ -1054,8 +1135,8 @@ static struct attribute_group reipl_nss_attr_group = { void set_os_info_reipl_block(void) { - os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, - reipl_block_actual->hdr.len); + os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); } /* reipl type */ @@ -1069,6 +1150,9 @@ static int reipl_set_type(enum ipl_type type) case IPL_TYPE_CCW: reipl_block_actual = reipl_block_ccw; break; + case IPL_TYPE_ECKD: + reipl_block_actual = reipl_block_eckd; + break; case IPL_TYPE_FCP: reipl_block_actual = reipl_block_fcp; break; @@ -1088,7 +1172,7 @@ static int reipl_set_type(enum ipl_type type) static ssize_t reipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", ipl_type_str(reipl_type)); + return sysfs_emit(page, "%s\n", ipl_type_str(reipl_type)); } static ssize_t reipl_type_store(struct kobject *kobj, @@ -1099,6 +1183,8 @@ static ssize_t reipl_type_store(struct kobject *kobj, if (strncmp(buf, IPL_CCW_STR, strlen(IPL_CCW_STR)) == 0) rc = reipl_set_type(IPL_TYPE_CCW); + else if (strncmp(buf, IPL_ECKD_STR, strlen(IPL_ECKD_STR)) == 0) + rc = reipl_set_type(IPL_TYPE_ECKD); else if (strncmp(buf, IPL_FCP_STR, strlen(IPL_FCP_STR)) == 0) rc = reipl_set_type(IPL_TYPE_FCP); else if (strncmp(buf, IPL_NVME_STR, strlen(IPL_NVME_STR)) == 0) @@ -1114,6 +1200,7 @@ static struct kobj_attribute reipl_type_attr = static struct kset *reipl_kset; static struct kset *reipl_fcp_kset; static struct kset *reipl_nvme_kset; +static struct kset *reipl_eckd_kset; static void __reipl_run(void *unused) { @@ -1125,6 +1212,13 @@ static void __reipl_run(void *unused) else diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); break; + case IPL_TYPE_ECKD: + diag308(DIAG308_SET, reipl_block_eckd); + if (reipl_eckd_clear) + diag308(DIAG308_LOAD_CLEAR, NULL); + else + diag308(DIAG308_LOAD_NORMAL, NULL); + break; case IPL_TYPE_FCP: diag308(DIAG308_SET, reipl_block_fcp); if (reipl_fcp_clear) @@ -1148,6 +1242,7 @@ static void __reipl_run(void *unused) break; case IPL_TYPE_FCP_DUMP: case IPL_TYPE_NVME_DUMP: + case IPL_TYPE_ECKD_DUMP: break; } disabled_wait(); @@ -1178,7 +1273,7 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ - if (MACHINE_IS_VM && ipl_block_valid && + if (machine_is_vm() && ipl_block_valid && (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; @@ -1192,7 +1287,7 @@ static int __init reipl_nss_init(void) { int rc; - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 0; reipl_block_nss = (void *) get_zeroed_page(GFP_KERNEL); @@ -1217,8 +1312,8 @@ static int __init reipl_ccw_init(void) return -ENOMEM; rc = sysfs_create_group(&reipl_kset->kobj, - MACHINE_IS_VM ? &reipl_ccw_attr_group_vm - : &reipl_ccw_attr_group_lpar); + machine_is_vm() ? &reipl_ccw_attr_group_vm + : &reipl_ccw_attr_group_lpar); if (rc) return rc; @@ -1345,6 +1440,58 @@ out1: return rc; } +static int __init reipl_eckd_init(void) +{ + int rc; + + if (!sclp.has_sipl_eckd) + return 0; + + reipl_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!reipl_block_eckd) + return -ENOMEM; + + /* sysfs: create kset for mixing attr group and bin attrs */ + reipl_eckd_kset = kset_create_and_add(IPL_ECKD_STR, NULL, + &reipl_kset->kobj); + if (!reipl_eckd_kset) { + free_page((unsigned long)reipl_block_eckd); + return -ENOMEM; + } + + rc = sysfs_create_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); + if (rc) + goto out1; + + if (test_facility(141)) { + rc = sysfs_create_file(&reipl_eckd_kset->kobj, + &sys_reipl_eckd_clear_attr.attr); + if (rc) + goto out2; + } else { + reipl_eckd_clear = true; + } + + if (ipl_info.type == IPL_TYPE_ECKD) { + memcpy(reipl_block_eckd, &ipl_block, sizeof(ipl_block)); + } else { + reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + reipl_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + reipl_block_eckd->eckd.pbt = IPL_PBT_ECKD; + reipl_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_IPL; + } + reipl_capabilities |= IPL_TYPE_ECKD; + return 0; + +out2: + sysfs_remove_group(&reipl_eckd_kset->kobj, &reipl_eckd_attr_group); +out1: + kset_unregister(reipl_eckd_kset); + free_page((unsigned long)reipl_block_eckd); + return rc; +} + static int __init reipl_type_init(void) { enum ipl_type reipl_type = ipl_info.type; @@ -1366,6 +1513,9 @@ static int __init reipl_type_init(void) } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_ECKD) { + memcpy(reipl_block_eckd, reipl_block, size); + reipl_type = IPL_TYPE_ECKD; } out: return reipl_set_type(reipl_type); @@ -1386,6 +1536,9 @@ static int __init reipl_init(void) rc = reipl_ccw_init(); if (rc) return rc; + rc = reipl_eckd_init(); + if (rc) + return rc; rc = reipl_fcp_init(); if (rc) return rc; @@ -1421,6 +1574,11 @@ DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, + dump_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, &sys_dump_fcp_wwpn_attr.attr, @@ -1430,9 +1588,15 @@ static struct attribute *dump_fcp_attrs[] = { NULL, }; -static struct attribute_group dump_fcp_attr_group = { +static const struct bin_attribute *const dump_fcp_bin_attrs[] = { + &sys_dump_fcp_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_fcp_attr_group = { .name = IPL_FCP_STR, .attrs = dump_fcp_attrs, + .bin_attrs = dump_fcp_bin_attrs, }; /* NVME dump device attributes */ @@ -1445,6 +1609,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, + dump_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_nvme_attrs[] = { &sys_dump_nvme_fid_attr.attr, &sys_dump_nvme_nsid_attr.attr, @@ -1453,9 +1622,49 @@ static struct attribute *dump_nvme_attrs[] = { NULL, }; -static struct attribute_group dump_nvme_attr_group = { +static const struct bin_attribute *const dump_nvme_bin_attrs[] = { + &sys_dump_nvme_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_nvme_attr_group = { .name = IPL_NVME_STR, .attrs = dump_nvme_attrs, + .bin_attrs = dump_nvme_bin_attrs, +}; + +/* ECKD dump device attributes */ +DEFINE_IPL_CCW_ATTR_RW(dump_eckd, device, dump_block_eckd->eckd); +DEFINE_IPL_ATTR_RW(dump_eckd, bootprog, "%lld\n", "%llx\n", + dump_block_eckd->eckd.bootprog); + +IPL_ATTR_BR_CHR_SHOW_FN(dump, dump_block_eckd->eckd); +IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); + +static struct kobj_attribute sys_dump_eckd_br_chr_attr = + __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); + +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_eckd, dump_block_eckd->hdr, + dump_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); + +static struct attribute *dump_eckd_attrs[] = { + &sys_dump_eckd_device_attr.attr, + &sys_dump_eckd_bootprog_attr.attr, + &sys_dump_eckd_br_chr_attr.attr, + NULL, +}; + +static const struct bin_attribute *const dump_eckd_bin_attrs[] = { + &sys_dump_eckd_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_eckd_attr_group = { + .name = IPL_ECKD_STR, + .attrs = dump_eckd_attrs, + .bin_attrs = dump_eckd_bin_attrs, }; /* CCW dump device attributes */ @@ -1484,7 +1693,7 @@ static int dump_set_type(enum dump_type type) static ssize_t dump_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", dump_type_str(dump_type)); + return sysfs_emit(page, "%s\n", dump_type_str(dump_type)); } static ssize_t dump_type_store(struct kobject *kobj, @@ -1497,6 +1706,8 @@ static ssize_t dump_type_store(struct kobject *kobj, rc = dump_set_type(DUMP_TYPE_NONE); else if (strncmp(buf, DUMP_CCW_STR, strlen(DUMP_CCW_STR)) == 0) rc = dump_set_type(DUMP_TYPE_CCW); + else if (strncmp(buf, DUMP_ECKD_STR, strlen(DUMP_ECKD_STR)) == 0) + rc = dump_set_type(DUMP_TYPE_ECKD); else if (strncmp(buf, DUMP_FCP_STR, strlen(DUMP_FCP_STR)) == 0) rc = dump_set_type(DUMP_TYPE_FCP); else if (strncmp(buf, DUMP_NVME_STR, strlen(DUMP_NVME_STR)) == 0) @@ -1507,6 +1718,24 @@ static ssize_t dump_type_store(struct kobject *kobj, static struct kobj_attribute dump_type_attr = __ATTR(dump_type, 0644, dump_type_show, dump_type_store); +static ssize_t dump_area_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sysfs_emit(page, "%lu\n", sclp.hsa_size); +} + +static struct kobj_attribute dump_area_size_attr = __ATTR_RO(dump_area_size); + +static struct attribute *dump_attrs[] = { + &dump_type_attr.attr, + &dump_area_size_attr.attr, + NULL, +}; + +static struct attribute_group dump_attr_group = { + .attrs = dump_attrs, +}; + static struct kset *dump_kset; static void diag308_dump(void *dump_block) @@ -1525,6 +1754,9 @@ static void __dump_run(void *unused) case DUMP_TYPE_CCW: diag308_dump(dump_block_ccw); break; + case DUMP_TYPE_ECKD: + diag308_dump(dump_block_eckd); + break; case DUMP_TYPE_FCP: diag308_dump(dump_block_fcp); break; @@ -1603,13 +1835,36 @@ static int __init dump_nvme_init(void) } dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; - dump_block_nvme->fcp.pbt = IPL_PBT_NVME; - dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + dump_block_nvme->nvme.pbt = IPL_PBT_NVME; + dump_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_DUMP; dump_capabilities |= DUMP_TYPE_NVME; return 0; } +static int __init dump_eckd_init(void) +{ + int rc; + + if (!sclp_ipl_info.has_dump || !sclp.has_sipl_eckd) + return 0; /* LDIPL DUMP is not installed */ + dump_block_eckd = (void *)get_zeroed_page(GFP_KERNEL); + if (!dump_block_eckd) + return -ENOMEM; + rc = sysfs_create_group(&dump_kset->kobj, &dump_eckd_attr_group); + if (rc) { + free_page((unsigned long)dump_block_eckd); + return rc; + } + dump_block_eckd->hdr.len = IPL_BP_ECKD_LEN; + dump_block_eckd->hdr.version = IPL_PARM_BLOCK_VERSION; + dump_block_eckd->eckd.len = IPL_BP0_ECKD_LEN; + dump_block_eckd->eckd.pbt = IPL_PBT_ECKD; + dump_block_eckd->eckd.opt = IPL_PB0_ECKD_OPT_DUMP; + dump_capabilities |= DUMP_TYPE_ECKD; + return 0; +} + static int __init dump_init(void) { int rc; @@ -1617,7 +1872,7 @@ static int __init dump_init(void) dump_kset = kset_create_and_add("dump", NULL, firmware_kobj); if (!dump_kset) return -ENOMEM; - rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr); + rc = sysfs_create_group(&dump_kset->kobj, &dump_attr_group); if (rc) { kset_unregister(dump_kset); return rc; @@ -1625,6 +1880,9 @@ static int __init dump_init(void) rc = dump_ccw_init(); if (rc) return rc; + rc = dump_eckd_init(); + if (rc) + return rc; rc = dump_fcp_init(); if (rc) return rc; @@ -1643,13 +1901,28 @@ static struct shutdown_action __refdata dump_action = { static void dump_reipl_run(struct shutdown_trigger *trigger) { - unsigned long ipib = (unsigned long) reipl_block_actual; + struct lowcore *abs_lc; unsigned int csum; - csum = (__force unsigned int) - csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0); - mem_assign_absolute(S390_lowcore.ipib, ipib); - mem_assign_absolute(S390_lowcore.ipib_checksum, csum); + /* + * Set REIPL_CLEAR flag in os_info flags entry indicating + * 'clear' sysfs attribute has been set on the panicked system + * for specified reipl type. + * Always set for IPL_TYPE_NSS and IPL_TYPE_UNKNOWN. + */ + if ((reipl_type == IPL_TYPE_CCW && reipl_ccw_clear) || + (reipl_type == IPL_TYPE_ECKD && reipl_eckd_clear) || + (reipl_type == IPL_TYPE_FCP && reipl_fcp_clear) || + (reipl_type == IPL_TYPE_NVME && reipl_nvme_clear) || + reipl_type == IPL_TYPE_NSS || + reipl_type == IPL_TYPE_UNKNOWN) + os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; + os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); + csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0); + abs_lc = get_abs_lowcore(); + abs_lc->ipib = __pa(reipl_block_actual); + abs_lc->ipib_checksum = csum; + put_abs_lowcore(abs_lc); dump_run(trigger); } @@ -1662,11 +1935,13 @@ static struct shutdown_action __refdata dump_reipl_action = { * vmcmd shutdown action: Trigger vm command on shutdown. */ -static char vmcmd_on_reboot[128]; -static char vmcmd_on_panic[128]; -static char vmcmd_on_halt[128]; -static char vmcmd_on_poff[128]; -static char vmcmd_on_restart[128]; +#define VMCMD_MAX_SIZE 240 + +static char vmcmd_on_reboot[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_panic[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_halt[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_poff[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_restart[VMCMD_MAX_SIZE + 1]; DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); @@ -1713,7 +1988,7 @@ static void vmcmd_run(struct shutdown_trigger *trigger) static int vmcmd_init(void) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return -EOPNOTSUPP; vmcmd_kset = kset_create_and_add("vmcmd", NULL, firmware_kobj); if (!vmcmd_kset) @@ -1778,7 +2053,7 @@ static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR, static ssize_t on_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_reboot_trigger.action->name); + return sysfs_emit(page, "%s\n", on_reboot_trigger.action->name); } static ssize_t on_reboot_store(struct kobject *kobj, @@ -1804,7 +2079,7 @@ static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action}; static ssize_t on_panic_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_panic_trigger.action->name); + return sysfs_emit(page, "%s\n", on_panic_trigger.action->name); } static ssize_t on_panic_store(struct kobject *kobj, @@ -1830,7 +2105,7 @@ static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, static ssize_t on_restart_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_restart_trigger.action->name); + return sysfs_emit(page, "%s\n", on_restart_trigger.action->name); } static ssize_t on_restart_store(struct kobject *kobj, @@ -1843,7 +2118,6 @@ static struct kobj_attribute on_restart_attr = __ATTR_RW(on_restart); static void __do_restart(void *ignore) { - __arch_local_irq_stosm(0x04); /* enable DAT */ smp_send_stop(); #ifdef CONFIG_CRASH_DUMP crash_kexec(NULL); @@ -1857,7 +2131,7 @@ void do_restart(void *arg) tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, arg); + smp_call_ipl_cpu(__do_restart, arg); } /* on halt */ @@ -1867,7 +2141,7 @@ static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; static ssize_t on_halt_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_halt_trigger.action->name); + return sysfs_emit(page, "%s\n", on_halt_trigger.action->name); } static ssize_t on_halt_store(struct kobject *kobj, @@ -1893,7 +2167,7 @@ static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action}; static ssize_t on_poff_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_poff_trigger.action->name); + return sysfs_emit(page, "%s\n", on_poff_trigger.action->name); } static ssize_t on_poff_store(struct kobject *kobj, @@ -1975,26 +2249,28 @@ static int __init s390_ipl_init(void) __initcall(s390_ipl_init); -static void __init strncpy_skip_quote(char *dst, char *src, int n) +static void __init strscpy_skip_quote(char *dst, char *src, int n) { int sx, dx; - dx = 0; - for (sx = 0; src[sx] != 0; sx++) { + if (!n) + return; + for (sx = 0, dx = 0; src[sx]; sx++) { if (src[sx] == '"') continue; - dst[dx++] = src[sx]; - if (dx >= n) + dst[dx] = src[sx]; + if (dx + 1 == n) break; + dx++; } + dst[dx] = '\0'; } static int __init vmcmd_on_reboot_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_reboot, str, 127); - vmcmd_on_reboot[127] = 0; + strscpy_skip_quote(vmcmd_on_reboot, str, sizeof(vmcmd_on_reboot)); on_reboot_trigger.action = &vmcmd_action; return 1; } @@ -2002,10 +2278,9 @@ __setup("vmreboot=", vmcmd_on_reboot_setup); static int __init vmcmd_on_panic_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_panic, str, 127); - vmcmd_on_panic[127] = 0; + strscpy_skip_quote(vmcmd_on_panic, str, sizeof(vmcmd_on_panic)); on_panic_trigger.action = &vmcmd_action; return 1; } @@ -2013,10 +2288,9 @@ __setup("vmpanic=", vmcmd_on_panic_setup); static int __init vmcmd_on_halt_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_halt, str, 127); - vmcmd_on_halt[127] = 0; + strscpy_skip_quote(vmcmd_on_halt, str, sizeof(vmcmd_on_halt)); on_halt_trigger.action = &vmcmd_action; return 1; } @@ -2024,10 +2298,9 @@ __setup("vmhalt=", vmcmd_on_halt_setup); static int __init vmcmd_on_poff_setup(char *str) { - if (!MACHINE_IS_VM) + if (!machine_is_vm()) return 1; - strncpy_skip_quote(vmcmd_on_poff, str, 127); - vmcmd_on_poff[127] = 0; + strscpy_skip_quote(vmcmd_on_poff, str, sizeof(vmcmd_on_poff)); on_poff_trigger.action = &vmcmd_action; return 1; } @@ -2055,6 +2328,11 @@ void __init setup_ipl(void) ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; + case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: + ipl_info.data.eckd.dev_id.ssid = ipl_block.eckd.ssid; + ipl_info.data.eckd.dev_id.devno = ipl_block.eckd.devno; + break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; @@ -2081,8 +2359,8 @@ void s390_reset_system(void) set_prefix(0); /* Disable lowcore protection */ - __ctl_clear_bit(0, 28); - diag_dma_ops.diag308_reset(); + local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + diag_amode31_ops.diag308_reset(); } #ifdef CONFIG_KEXEC_FILE @@ -2159,7 +2437,7 @@ void *ipl_report_finish(struct ipl_report *report) buf = vzalloc(report->size); if (!buf) - return ERR_PTR(-ENOMEM); + goto out; ptr = buf; memcpy(ptr, report->ipib, report->ipib->hdr.len); @@ -2198,6 +2476,7 @@ void *ipl_report_finish(struct ipl_report *report) } BUG_ON(ptr > buf + report->size); +out: return buf; } diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index af43535a976d..b5245fadcfb0 100644 --- a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/minmax.h> +#include <linux/string.h> #include <asm/ebcdic.h> #include <asm/ipl.h> diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 234d085257eb..bdf9c7cb5685 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -9,6 +9,7 @@ */ #include <linux/kernel_stat.h> +#include <linux/cpufeature.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> @@ -25,10 +26,13 @@ #include <asm/irq_regs.h> #include <asm/cputime.h> #include <asm/lowcore.h> +#include <asm/machine.h> #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/stacktrace.h> #include <asm/softirq_stack.h> +#include <asm/vtime.h> +#include <asm/asm.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -75,13 +79,13 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQEXT_WTI, .name = "WTI", .desc = "[EXT] Warning Track"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, {.irq = IRQIO_TAP, .name = "TAP", .desc = "[I/O] Tape"}, {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, - {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, @@ -99,8 +103,8 @@ static const struct irq_class irqclass_sub_desc[] = { static void do_IRQ(struct pt_regs *regs, int irq) { - if (tod_after_eq(S390_lowcore.int_clock, - S390_lowcore.clock_comparator)) + if (tod_after_eq(get_lowcore()->int_clock, + get_lowcore()->clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); @@ -110,7 +114,7 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; + return ((get_lowcore()->async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) @@ -118,7 +122,7 @@ static void do_irq_async(struct pt_regs *regs, int irq) if (on_async_stack()) { do_IRQ(regs, irq); } else { - call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + call_on_stack(2, get_lowcore()->async_stack, void, do_IRQ, struct pt_regs *, regs, int, irq); } } @@ -127,35 +131,44 @@ static int irq_pending(struct pt_regs *regs) { int cc; - asm volatile("tpi 0\n" - "ipm %0" : "=d" (cc) : : "cc"); - return cc >> 28; + asm volatile( + " tpi 0\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc); } void noinstr do_io_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; - irq_enter(); + irq_enter_rcu(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (cpu_has_bear()) + current->thread.last_break = regs->last_break; + } - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); + set_cpu_flag(CIF_NOHZ_DELAY); do { - regs->tpi_info = S390_lowcore.tpi_info; - if (S390_lowcore.tpi_info.adapter_IO) + regs->tpi_info = get_lowcore()->tpi_info; + if (get_lowcore()->tpi_info.adapter_IO) do_irq_async(regs, THIN_INTERRUPT); else do_irq_async(regs, IO_INTERRUPT); - } while (MACHINE_IS_LPAR && irq_pending(regs)); + } while (machine_is_lpar() && irq_pending(regs)); + + irq_exit_rcu(); - irq_exit(); set_irq_regs(old_regs); irqentry_exit(regs, state); @@ -167,24 +180,27 @@ void noinstr do_ext_irq(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); struct pt_regs *old_regs = set_irq_regs(regs); - int from_idle; + bool from_idle; - irq_enter(); + irq_enter_rcu(); - if (user_mode(regs)) + if (user_mode(regs)) { update_timer_sys(); + if (cpu_has_bear()) + current->thread.last_break = regs->last_break; + } - regs->int_code = S390_lowcore.ext_int_code_addr; - regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = S390_lowcore.ext_params2; + regs->int_code = get_lowcore()->ext_int_code_addr; + regs->int_parm = get_lowcore()->ext_params; + regs->int_parm_long = get_lowcore()->ext_params2; - from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); do_irq_async(regs, EXT_INTERRUPT); - irq_exit(); + irq_exit_rcu(); set_irq_regs(old_regs); irqentry_exit(regs, state); @@ -198,7 +214,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) unsigned long flags; int cpu; - irq_lock_sparse(); + rcu_read_lock(); desc = irq_to_desc(irq); if (!desc) goto out; @@ -217,7 +233,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); out: - irq_unlock_sparse(); + rcu_read_unlock(); } /* @@ -228,7 +244,7 @@ int show_interrupts(struct seq_file *p, void *v) int index = *(loff_t *) v; int cpu, irq; - get_online_cpus(); + cpus_read_lock(); if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) @@ -243,7 +259,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index < nr_irqs) { + if (index < irq_get_nr_irqs()) { show_msi_interrupt(p, index); goto out; } @@ -258,7 +274,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } out: - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -335,7 +351,7 @@ static irqreturn_t do_ext_interrupt(int irq, void *dummy) struct ext_int_info *p; int index; - ext_code = *(struct ext_code *) ®s->int_code; + ext_code.int_code = regs->int_code; if (ext_code.code != EXT_IRQ_CLK_COMP) set_cpu_flag(CIF_NOHZ_DELAY); @@ -378,7 +394,7 @@ void irq_subclass_register(enum irq_subclass subclass) { spin_lock(&irq_subclass_lock); if (!irq_subclass_refcount[subclass]) - ctl_set_bit(0, subclass); + system_ctl_set_bit(0, subclass); irq_subclass_refcount[subclass]++; spin_unlock(&irq_subclass_lock); } @@ -389,7 +405,7 @@ void irq_subclass_unregister(enum irq_subclass subclass) spin_lock(&irq_subclass_lock); irq_subclass_refcount[subclass]--; if (!irq_subclass_refcount[subclass]) - ctl_clear_bit(0, subclass); + system_ctl_clear_bit(0, subclass); spin_unlock(&irq_subclass_lock); } EXPORT_SYMBOL(irq_subclass_unregister); diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index ab584e8e3527..e808bb8bc0da 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -6,8 +6,9 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/uaccess.h> -#include <linux/stop_machine.h> #include <linux/jump_label.h> +#include <linux/module.h> +#include <asm/text-patching.h> #include <asm/ipl.h> struct insn { @@ -36,21 +37,15 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, unsigned char *ipe = (unsigned char *)expected; unsigned char *ipn = (unsigned char *)new; - pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc); + pr_emerg("Jump label code mismatch at %pS [%px]\n", ipc, ipc); pr_emerg("Found: %6ph\n", ipc); pr_emerg("Expected: %6ph\n", ipe); pr_emerg("New: %6ph\n", ipn); panic("Corrupted kernel text"); } -static struct insn orignop = { - .opcode = 0xc004, - .offset = JUMP_LABEL_NOP_OFFSET >> 1, -}; - -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { void *code = (void *)jump_entry_code(entry); struct insn old, new; @@ -62,29 +57,26 @@ static void __jump_label_transform(struct jump_entry *entry, jump_label_make_branch(entry, &old); jump_label_make_nop(entry, &new); } - if (init) { - if (memcmp(code, &orignop, sizeof(orignop))) - jump_label_bug(entry, &orignop, &new); - } else { - if (memcmp(code, &old, sizeof(old))) - jump_label_bug(entry, &old, &new); - } + if (memcmp(code, &old, sizeof(old))) + jump_label_bug(entry, &old, &new); s390_kernel_write(code, &new, sizeof(new)); } -static void __jump_label_sync(void *dummy) +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) { + jump_label_transform(entry, type); + text_poke_sync(); } -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) { - __jump_label_transform(entry, type, 0); - smp_call_function(__jump_label_sync, NULL, 1); + jump_label_transform(entry, type); + return true; } -void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) +void arch_jump_label_transform_apply(void) { - __jump_label_transform(entry, type, 1); + text_poke_sync(); } diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 9da6fa30c447..143e34a4eca5 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; @@ -40,8 +40,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = phdr->p_memsz; data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index af23eff5774d..9a439175723c 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; @@ -24,8 +24,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.bufsz = image->kernel_buf_len; buf.mem = 0; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 52d056a5f89f..c450120b4474 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -7,11 +7,13 @@ * s390 port, used ppc64 as template. Mike Grundy <grundym@us.ibm.com> */ -#include <linux/moduleloader.h> +#define pr_fmt(fmt) "kprobes: " fmt + #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> #include <linux/stop_machine.h> +#include <linux/cpufeature.h> #include <linux/kdebug.h> #include <linux/uaccess.h> #include <linux/extable.h> @@ -19,6 +21,8 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/ftrace.h> +#include <linux/execmem.h> +#include <asm/text-patching.h> #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> @@ -29,41 +33,17 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(s390_insn); - -static int insn_page_in_use; - void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; - __set_memory((unsigned long) page, 1, SET_MEMORY_RO | SET_MEMORY_X); + set_memory_rox((unsigned long)page, 1); return page; } -static void *alloc_s390_insn_page(void) -{ - if (xchg(&insn_page_in_use, 1) == 1) - return NULL; - return &kprobes_insn_page; -} - -static void free_s390_insn_page(void *page) -{ - xchg(&insn_page_in_use, 0); -} - -struct kprobe_insn_cache kprobe_s390_insn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), - .alloc = alloc_s390_insn_page, - .free = free_s390_insn_page, - .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), - .insn_size = MAX_INSN_SIZE, -}; - static void copy_instruction(struct kprobe *p) { kprobe_opcode_t insn[MAX_INSN_SIZE]; @@ -77,10 +57,10 @@ static void copy_instruction(struct kprobe *p) if (probe_is_insn_relative_long(&insn[0])) { /* * For pc-relative instructions in RIL-b or RIL-c format patch - * the RI2 displacement field. We have already made sure that - * the insn slot for the patched instruction is within the same - * 2GB area as the original instruction (either kernel image or - * module area). Therefore the new displacement will always fit. + * the RI2 displacement field. The insn slot for the to be + * patched instruction is within the same 4GB area like the + * original instruction. Therefore the new displacement will + * always fit. */ disp = *(s32 *)&insn[1]; addr = (u64)(unsigned long)p->addr; @@ -92,42 +72,61 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static int s390_get_insn_slot(struct kprobe *p) +/* Check if paddr is at an instruction boundary */ +static bool can_probe(unsigned long paddr) { - /* - * Get an insn slot that is within the same 2GB area like the original - * instruction. That way instructions with a 32bit signed displacement - * field can be patched and executed within the insn slot. - */ - p->ainsn.insn = NULL; - if (is_kernel((unsigned long)p->addr)) - p->ainsn.insn = get_s390_insn_slot(); - else if (is_module_addr(p->addr)) - p->ainsn.insn = get_insn_slot(); - return p->ainsn.insn ? 0 : -ENOMEM; -} -NOKPROBE_SYMBOL(s390_get_insn_slot); - -static void s390_free_insn_slot(struct kprobe *p) -{ - if (!p->ainsn.insn) - return; - if (is_kernel((unsigned long)p->addr)) - free_s390_insn_slot(p->ainsn.insn, 0); - else - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; + unsigned long addr, offset = 0; + kprobe_opcode_t insn; + struct kprobe *kp; + + if (paddr & 0x01) + return false; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return false; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn))) + return false; + + if (insn >> 8 == 0) { + if (insn != BREAKPOINT_INSTRUCTION) { + /* + * Note that QEMU inserts opcode 0x0000 to implement + * software breakpoints for guests. Since the size of + * the original instruction is unknown, stop following + * instructions and prevent setting a kprobe. + */ + return false; + } + /* + * Check if the instruction has been modified by another + * kprobe, in which case the original instruction is + * decoded. + */ + kp = get_kprobe((void *)addr); + if (!kp) { + /* not a kprobe */ + return false; + } + insn = kp->opcode; + } + addr += insn_length(insn >> 8); + } + return addr == paddr; } -NOKPROBE_SYMBOL(s390_free_insn_slot); int arch_prepare_kprobe(struct kprobe *p) { - if ((unsigned long) p->addr & 0x01) + if (!can_probe((unsigned long)p->addr)) return -EINVAL; /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) return -EINVAL; - if (s390_get_insn_slot(p)) + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) return -ENOMEM; copy_instruction(p); return 0; @@ -155,7 +154,12 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (cpu_has_seq_insn()) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -163,13 +167,21 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (cpu_has_seq_insn()) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_disarm_kprobe); void arch_remove_kprobe(struct kprobe *p) { - s390_free_insn_slot(p); + if (!p->ainsn.insn) + return; + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; } NOKPROBE_SYMBOL(arch_remove_kprobe); @@ -177,20 +189,27 @@ static void enable_singlestep(struct kprobe_ctlblk *kcb, struct pt_regs *regs, unsigned long ip) { - struct per_regs per_kprobe; + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } per_kprobe; /* Set up the PER control registers %cr9-%cr11 */ - per_kprobe.control = PER_EVENT_IFETCH; - per_kprobe.start = ip; - per_kprobe.end = ip; + per_kprobe.control.val = PER_EVENT_IFETCH; + per_kprobe.start.val = ip; + per_kprobe.end.val = ip; /* Save control regs and psw mask */ - __ctl_store(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_store(9, 11, kcb->kprobe_saved_ctl); kcb->kprobe_saved_imask = regs->psw.mask & (PSW_MASK_PER | PSW_MASK_IO | PSW_MASK_EXT); /* Set PER control regs, turns on single step for the given address */ - __ctl_load(per_kprobe, 9, 11); + __local_ctl_load(9, 11, per_kprobe.regs); regs->psw.mask |= PSW_MASK_PER; regs->psw.mask &= ~(PSW_MASK_IO | PSW_MASK_EXT); regs->psw.addr = ip; @@ -202,7 +221,7 @@ static void disable_singlestep(struct kprobe_ctlblk *kcb, unsigned long ip) { /* Restore control regs and psw mask, set new psw address */ - __ctl_load(kcb->kprobe_saved_ctl, 9, 11); + __local_ctl_load(9, 11, kcb->kprobe_saved_ctl); regs->psw.mask &= ~PSW_MASK_PER; regs->psw.mask |= kcb->kprobe_saved_imask; regs->psw.addr = ip; @@ -231,19 +250,10 @@ static void pop_kprobe(struct kprobe_ctlblk *kcb) { __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->prev_kprobe.kp = NULL; } NOKPROBE_SYMBOL(pop_kprobe); -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - ri->ret_addr = (kprobe_opcode_t *) regs->gprs[14]; - ri->fp = NULL; - - /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) { switch (kcb->kprobe_status) { @@ -259,7 +269,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) * is a BUG. The code path resides in the .kprobes.text * section and is executed with interrupts disabled. */ - pr_err("Invalid kprobe detected.\n"); + pr_err("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); } @@ -325,33 +335,6 @@ static int kprobe_handler(struct pt_regs *regs) NOKPROBE_SYMBOL(kprobe_handler); /* - * Function return probe trampoline: - * - init_kprobes() establishes a probepoint here - * - When the probed function returns, this probe - * causes the handlers to fire - */ -static void __used kretprobe_trampoline_holder(void) -{ - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline: bcr 0,0\n"); -} - -/* - * Called when the probe at kretprobe trampoline is hit - */ -static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - regs->psw.addr = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} -NOKPROBE_SYMBOL(trampoline_probe_handler); - -/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "breakpoint" * instruction. To avoid the SMP problems that can occur when we @@ -392,12 +375,11 @@ static int post_kprobe_handler(struct pt_regs *regs) if (!p) return 0; + resume_execution(p, regs); if (kcb->kprobe_status != KPROBE_REENTER && p->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - - resume_execution(p, regs); pop_kprobe(kcb); preempt_enable_no_resched(); @@ -417,7 +399,6 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); struct kprobe *p = kprobe_running(); - const struct exception_table_entry *entry; switch(kcb->kprobe_status) { case KPROBE_HIT_SS: @@ -439,10 +420,8 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - entry = s390_search_extables(regs->psw.addr); - if (entry && ex_handle(entry, regs)) + if (fixup_exception(regs)) return 1; - /* * fixup_exception() could not handle it, * Let do_page_fault() fix it. @@ -506,18 +485,19 @@ int kprobe_exceptions_notify(struct notifier_block *self, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - int __init arch_init_kprobes(void) { - return register_kprobe(&trampoline); + return 0; +} + +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); } int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; + return 0; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S deleted file mode 100644 index f6cb022ef8c8..000000000000 --- a/arch/s390/kernel/kprobes_insn_page.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <linux/linkage.h> - -/* - * insn_page is a special 4k aligned dummy function for kprobes. - * It will contain all kprobed instructions that are out-of-line executed. - * The page must be within the kernel image to guarantee that the - * out-of-line instructions are within 2GB distance of their original - * location. Using a dummy function ensures that the insn_page is within - * the text section of the kernel and mapped read-only/executable from - * the beginning on, thus avoiding to split large mappings if the page - * would be in the data section instead. - */ - .section .kprobes.text, "ax" - .align 4096 -ENTRY(kprobes_insn_page) - .rept 2048 - .word 0x07fe - .endr -ENDPROC(kprobes_insn_page) - .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 3b895971c3d0..6d1ffca5f798 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -88,8 +88,7 @@ static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) if (stsi(si, 2, 2, 2)) return; cpascii(lgr_info->name, si->name, sizeof(si->name)); - memcpy(&lgr_info->lpar_number, &si->lpar_number, - sizeof(lgr_info->lpar_number)); + lgr_info->lpar_number = si->lpar_number; } /* @@ -167,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); + mod_timer(&lgr_timer, jiffies + secs_to_jiffies(LGR_TIMER_INTERVAL_SECS)); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 1005a6935fbe..baeb3dcfc1c8 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -3,7 +3,6 @@ * Copyright IBM Corp. 2005, 2011 * * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> * Michael Holzheu <holzheu@linux.vnet.ibm.com> */ @@ -14,7 +13,12 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/debug_locks.h> +#include <linux/cpufeature.h> +#include <asm/guarded_storage.h> +#include <asm/machine.h> +#include <asm/pfault.h> #include <asm/cio.h> +#include <asm/fpu.h> #include <asm/setup.h> #include <asm/smp.h> #include <asm/ipl.h> @@ -22,13 +26,15 @@ #include <asm/elf.h> #include <asm/asm-offsets.h> #include <asm/cacheflush.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> #include <asm/set_memory.h> #include <asm/stacktrace.h> -#include <asm/switch_to.h> #include <asm/nmi.h> +#include <asm/sclp.h> -typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); +typedef void (*relocate_kernel_t)(unsigned long, unsigned long, unsigned long); +typedef int (*purgatory_t)(int); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; @@ -39,13 +45,16 @@ extern const unsigned long long relocate_kernel_len; * Reset the system, copy boot CPU registers to absolute zero, * and jump to the kdump image */ -static void __do_machine_kdump(void *image) +static void __do_machine_kdump(void *data) { - int (*start_kdump)(int); + struct kimage *image = data; + purgatory_t purgatory; unsigned long prefix; + purgatory = (purgatory_t)image->start; + /* store_status() saved the prefix register to lowcore */ - prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + prefix = (unsigned long)get_lowcore()->prefixreg_save_area; /* Now do the reset */ s390_reset_system(); @@ -55,14 +64,12 @@ static void __do_machine_kdump(void *image) * This need to be done *after* s390_reset_system set the * prefix register of this CPU to zero */ - memcpy((void *) __LC_FPREGS_SAVE_AREA, - (void *)(prefix + __LC_FPREGS_SAVE_AREA), 512); + memcpy(absolute_pointer(get_lowcore()->floating_pt_save_area), + phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); - __load_psw_mask(PSW_MASK_BASE | PSW_DEFAULT_KEY | PSW_MASK_EA | PSW_MASK_BA); - start_kdump = (void *)((struct kimage *) image)->start; - start_kdump(1); + call_nodat(1, int, purgatory, int, 1); - /* Die if start_kdump returns */ + /* Die if kdump returns */ disabled_wait(); } @@ -86,16 +93,16 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (MACHINE_HAS_VX) + mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK); + if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); - if (MACHINE_HAS_GS) { - __ctl_store(cr2_old.val, 2, 2); + if (cpu_has_gs()) { + local_ctl_store(2, &cr2_old.reg); cr2_new = cr2_old; cr2_new.gse = 1; - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); save_gs_cb((struct gs_cb *) mcesa->guarded_storage_save_area); - __ctl_load(cr2_old.val, 2, 2); + local_ctl_load(2, &cr2_old.reg); } /* * To create a good backchain for this CPU in the dump store_status @@ -109,18 +116,6 @@ static noinline void __machine_kdump(void *image) store_status(__do_machine_kdump, image); } -static unsigned long do_start_kdump(unsigned long addr) -{ - struct kimage *image = (struct kimage *) addr; - int (*start_kdump)(int) = (void *)image->start; - int rc; - - __arch_local_irq_stnsm(0xfb); /* disable DAT */ - rc = start_kdump(0); - __arch_local_irq_stosm(0x04); /* enable DAT */ - return rc; -} - #endif /* CONFIG_CRASH_DUMP */ /* @@ -129,12 +124,10 @@ static unsigned long do_start_kdump(unsigned long addr) static bool kdump_csum_valid(struct kimage *image) { #ifdef CONFIG_CRASH_DUMP + purgatory_t purgatory = (purgatory_t)image->start; int rc; - preempt_disable(); - rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump, - unsigned long, (unsigned long)image); - preempt_enable(); + rc = call_nodat(1, int, purgatory, int, 0); return rc == 0; #else return false; @@ -187,7 +180,7 @@ void arch_kexec_unprotect_crashkres(void) static int machine_kexec_prepare_kdump(void) { #ifdef CONFIG_CRASH_DUMP - if (MACHINE_IS_VM) + if (machine_is_vm()) diag10_range(PFN_DOWN(crashk_res.start), PFN_DOWN(crashk_res.end - crashk_res.start + 1)); return 0; @@ -208,7 +201,7 @@ int machine_kexec_prepare(struct kimage *image) return -EINVAL; /* Get the destination where the assembler code should be copied to.*/ - reboot_code_buffer = (void *) page_to_phys(image->control_code_page); + reboot_code_buffer = page_to_virt(image->control_code_page); /* Then copy it */ memcpy(reboot_code_buffer, relocate_kernel, relocate_kernel_len); @@ -219,17 +212,6 @@ void machine_kexec_cleanup(struct kimage *image) { } -void arch_crash_save_vmcoreinfo(void) -{ - VMCOREINFO_SYMBOL(lowcore_ptr); - VMCOREINFO_SYMBOL(high_memory); - VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); - vmcoreinfo_append_str("SDMA=%lx\n", __sdma); - vmcoreinfo_append_str("EDMA=%lx\n", __edma); - vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - void machine_shutdown(void) { } @@ -244,15 +226,20 @@ void machine_crash_shutdown(struct pt_regs *regs) */ static void __do_machine_kexec(void *data) { - relocate_kernel_t data_mover; + unsigned long data_mover, entry, diag308_subcode; struct kimage *image = data; + data_mover = page_to_phys(image->control_code_page); + entry = virt_to_phys(&image->head); + diag308_subcode = DIAG308_CLEAR_RESET; + if (sclp.has_iplcc) + diag308_subcode |= DIAG308_FLAG_EI; s390_reset_system(); - data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); - __arch_local_irq_stnsm(0xfb); /* disable DAT - avoid no-execute */ - /* Call the moving routine */ - (*data_mover)(&image->head, image->start); + call_nodat(3, void, (relocate_kernel_t)data_mover, + unsigned long, entry, + unsigned long, image->start, + unsigned long, diag308_subcode); /* Die if kexec returns */ disabled_wait(); @@ -263,7 +250,6 @@ static void __do_machine_kexec(void *data) */ static void __machine_kexec(void *data) { - __arch_local_irq_stosm(0x04); /* enable DAT */ pfault_fini(); tracing_off(); debug_locks_off(); diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f9e4baa64b67..a36d7311c668 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -7,11 +7,14 @@ * Author(s): Philipp Rudo <prudo@linux.vnet.ibm.com> */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/elf.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/module_signature.h> #include <linux/verification.h> +#include <linux/vmalloc.h> #include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/setup.h> @@ -28,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; struct module_signature *ms; unsigned long sig_len; + int ret; /* Skip signature verification when not secure IPLed. */ if (!ipl_secure_flag) @@ -62,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) return -EBADMSG; } - return verify_pkcs7_signature(kernel, kernel_len, - kernel + kernel_len, sig_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_MODULE_SIGNATURE, - NULL, NULL); + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_SECONDARY_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) + ret = verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); + return ret; } #endif /* CONFIG_KEXEC_SIG */ @@ -94,6 +105,7 @@ static int kexec_file_update_purgatory(struct kimage *image, if (ret) return ret; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { u64 crash_size; @@ -110,21 +122,24 @@ static int kexec_file_update_purgatory(struct kimage *image, sizeof(crash_size), false); } +#endif return ret; } static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif ret = kexec_load_purgatory(image, &buf); if (ret) @@ -137,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -147,8 +162,10 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; +#endif buf.memsz = buf.bufsz; data->parm->initrd_start = data->memsz; @@ -167,18 +184,17 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; + int ret; buf.image = image; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; ncerts = 0; while (ptr < end) { @@ -190,7 +206,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, addr = data->memsz + data->report->size; addr += ncerts * sizeof(struct ipl_rb_certificate_entry); - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); while (ptr < end) { len = *(unsigned int *)ptr; ptr += sizeof(len); @@ -199,9 +215,13 @@ static int kexec_file_add_ipl_report(struct kimage *image, ptr += len; } + ret = -ENOMEM; buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; data->memsz += buf.memsz; @@ -209,14 +229,23 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; - return kexec_add_buffer(&buf); +#ifdef CONFIG_CRASH_DUMP + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; +#endif + + ret = kexec_add_buffer(&buf); +out: + return ret; } void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)) { + unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE; struct s390_load_data data = {0}; + unsigned long minsize; int ret; data.report = ipl_report_init(&ipl_block); @@ -227,17 +256,32 @@ void *kexec_file_add_components(struct kimage *image, if (ret) goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { - ret = -EINVAL; + ret = -EINVAL; + minsize = PARMAREA + offsetof(struct parmarea, command_line); + if (image->kernel_buf_len < minsize) goto out; - } + + if (data.parm->max_command_line_size) + max_command_line_size = data.parm->max_command_line_size; + + if (minsize + max_command_line_size < minsize) + goto out; + + if (image->kernel_buf_len < minsize + max_command_line_size) + goto out; + + if (image->cmdline_buf_len >= max_command_line_size) + goto out; + memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { data.parm->oldmem_base = crashk_res.start; data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } +#endif if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); @@ -267,8 +311,16 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; Elf_Rela *relas; int i, r_type; + int ret; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -281,15 +333,27 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, sym = (void *)pi->ehdr + symtab->sh_offset; sym += ELF64_R_SYM(relas[i].r_info); - if (sym->st_shndx == SHN_UNDEF) + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; + } - if (sym->st_shndx == SHN_COMMON) + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; + } if (sym->st_shndx >= pi->ehdr->e_shnum && - sym->st_shndx != SHN_ABS) + sym->st_shndx != SHN_ABS) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); return -ENOEXEC; + } loc = pi->purgatory_buf; loc += section->sh_offset; @@ -303,21 +367,23 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; r_type = ELF64_R_TYPE(relas[i].r_info); - arch_kexec_do_relocs(r_type, loc, val, addr); + + if (r_type == R_390_PLT32DBL) + r_type = R_390_PC32DBL; + + ret = arch_kexec_do_relocs(r_type, loc, val, addr); + if (ret) { + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } } return 0; } -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) +int arch_kimage_file_post_load_cleanup(struct kimage *image) { - /* A kernel must be at least large enough to contain head.S. During - * load memory in head.S will be accessed, e.g. to register the next - * command line. If the next kernel were smaller the current kernel - * will panic at load. - */ - if (buf_len < HEAD_END) - return -ENOEXEC; - - return kexec_image_probe_default(image, buf, buf_len); + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); } diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index faf64c2f90f5..1fec370fecf4 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -2,8 +2,6 @@ /* * Copyright IBM Corp. 2008, 2009 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/linkage.h> @@ -11,96 +9,177 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> -#include <asm/export.h> +#include <asm/march.h> + +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) + +/* packed stack: allocate just enough for r14, r15 and backchain */ +#define TRACED_FUNC_FRAME_SIZE 24 + +#ifdef CONFIG_FUNCTION_TRACER GEN_BR_THUNK %r1 GEN_BR_THUNK %r14 .section .kprobes.text, "ax" -ENTRY(ftrace_stub) +SYM_FUNC_START(ftrace_stub) BR_EX %r14 -ENDPROC(ftrace_stub) - -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) -#ifdef __PACK_STACK -/* allocate just enough for r14, r15 and backchain */ -#define TRACED_FUNC_FRAME_SIZE 24 -#else -#define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD -#endif +SYM_FUNC_END(ftrace_stub) + +SYM_CODE_START(ftrace_stub_direct_tramp) + lgr %r1, %r0 + BR_EX %r1 +SYM_CODE_END(ftrace_stub_direct_tramp) -ENTRY(ftrace_caller) - .globl ftrace_regs_caller - .set ftrace_regs_caller,ftrace_caller + .macro ftrace_regs_entry, allregs=0 stg %r14,(__SF_GPRS+8*8)(%r15) # save traced function caller - lghi %r14,0 # save condition code - ipm %r14 # don't put any instructions - sllg %r14,%r14,16 # clobbering CC before this point + + .if \allregs == 1 + # save psw mask + # don't put any instructions clobbering CC before this point + epsw %r1,%r14 + risbg %r14,%r1,0,31,32 + .endif + lgr %r1,%r15 # allocate stack frame for ftrace_caller to contain traced function aghi %r15,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) stg %r0,(__SF_GPRS+8*8)(%r15) stg %r15,(__SF_GPRS+9*8)(%r15) - # allocate pt_regs and stack frame for ftrace_trace_function - aghi %r15,-STACK_FRAME_SIZE - stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) - stg %r14,(STACK_PTREGS_PSW)(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + + .if \allregs == 1 + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + .else + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) + .endif + lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address - stosm (STACK_PTREGS_PSW)(%r15),0 aghi %r1,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) - stg %r0,(STACK_PTREGS_PSW+8)(%r15) - stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + .endm + +SYM_CODE_START(ftrace_regs_caller) + ftrace_regs_entry 1 + j ftrace_common +SYM_CODE_END(ftrace_regs_caller) + +SYM_CODE_START(ftrace_caller) + ftrace_regs_entry 0 + j ftrace_common +SYM_CODE_END(ftrace_caller) + +SYM_CODE_START(ftrace_common) +#ifdef MARCH_HAS_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op - lgrl %r1,ftrace_trace_function + lgrl %r1,ftrace_func #else lgr %r2,%r0 aghi %r2,-MCOUNT_INSN_SIZE larl %r4,function_trace_op lg %r4,0(%r4) - larl %r1,ftrace_trace_function + larl %r1,ftrace_func lg %r1,0(%r1) #endif lgr %r3,%r14 - la %r5,STACK_PTREGS(%r15) + la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -# The j instruction gets runtime patched to a nop instruction. -# See ftrace_enable_ftrace_graph_caller. - .globl ftrace_graph_caller -ftrace_graph_caller: - j ftrace_graph_caller_end - lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_PTREGS_PSW+8)(%r15) - brasl %r14,prepare_ftrace_return - stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) -ftrace_graph_caller_end: - .globl ftrace_graph_caller_end + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) +#ifdef MARCH_HAS_Z196_FEATURES + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + locgrz %r1,%r0 +#else + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) + ltgr %r1,%r1 + jnz 0f + lgr %r1,%r0 #endif - lg %r1,(STACK_PTREGS_PSW+8)(%r15) - lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 -ENDPROC(ftrace_caller) +SYM_CODE_END(ftrace_common) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(return_to_handler) +SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD + # allocate ftrace_regs and stack frame for ftrace_return_to_handler + aghi %r15,-STACK_FRAME_SIZE_FREGS stg %r1,__SF_BACKCHAIN(%r15) + stg %r2,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + la %r2,STACK_FRAME_OVERHEAD(%r15) brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD + aghi %r15,STACK_FRAME_SIZE_FREGS lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -ENDPROC(return_to_handler) +SYM_FUNC_END(return_to_handler) #endif +#endif /* CONFIG_FUNCTION_TRACER */ + +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_br) + lmg %r0,%r1,2(%r1) + br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_br_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_br) + +#ifdef CONFIG_EXPOLINE +SYM_CODE_START(ftrace_shared_hotpatch_trampoline_exrl) + lmg %r0,%r1,2(%r1) + exrl %r0,0f + j . +0: br %r1 +SYM_INNER_LABEL(ftrace_shared_hotpatch_trampoline_exrl_end, SYM_L_GLOBAL) +SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) +#endif /* CONFIG_EXPOLINE */ + +#ifdef CONFIG_RETHOOK + +SYM_CODE_START(arch_rethook_trampoline) + stg %r14,(__SF_GPRS+8*8)(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) + stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) + + # store original stack pointer in backchain and pt_regs + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) + stg %r7,__SF_BACKCHAIN(%r15) + stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) + + # store full psw + epsw %r2,%r3 + risbg %r3,%r2,0,31,32 + stg %r3,STACK_PTREGS_PSW(%r15) + larl %r1,arch_rethook_trampoline + stg %r1,STACK_PTREGS_PSW+8(%r15) + + lay %r2,STACK_PTREGS(%r15) + brasl %r14,arch_rethook_trampoline_callback + + mvc __SF_EMPTY(16,%r7),STACK_PTREGS_PSW(%r15) + lmg %r0,%r15,STACK_PTREGS_GPRS(%r15) + lpswe __SF_EMPTY(%r15) +SYM_CODE_END(arch_rethook_trampoline) + +#endif /* CONFIG_RETHOOK */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 4055f1c49814..9d1f8a50f5a4 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -14,15 +14,22 @@ #include <linux/elf.h> #include <linux/vmalloc.h> #include <linux/fs.h> +#include <linux/ftrace.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/kasan.h> #include <linux/moduleloader.h> #include <linux/bug.h> #include <linux/memory.h> +#include <linux/execmem.h> +#include <asm/arch-stackprotector.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> +#include <asm/ftrace.lds.h> +#include <asm/set_memory.h> +#include <asm/setup.h> +#include <asm/asm-offsets.h> #if 0 #define DEBUGP printk @@ -30,23 +37,14 @@ #define DEBUGP(fmt , ...) #endif -#define PLT_ENTRY_SIZE 20 +#define PLT_ENTRY_SIZE 22 -void *module_alloc(unsigned long size) +#ifdef CONFIG_FUNCTION_TRACER +void module_arch_cleanup(struct module *mod) { - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { - vfree(p); - return NULL; - } - return p; + execmem_free(mod->arch.trampolines_start); } +#endif void module_arch_freeing_init(struct module *mod) { @@ -115,6 +113,7 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, Elf_Rela *rela; char *strings; int nrela, i, j; + struct module_memory *mod_mem; /* Find symbol table and string table. */ symtab = NULL; @@ -162,14 +161,15 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, /* Increase core size by size of got & plt and set start offsets for got and plt. */ - me->core_layout.size = ALIGN(me->core_layout.size, 4); - me->arch.got_offset = me->core_layout.size; - me->core_layout.size += me->arch.got_size; - me->arch.plt_offset = me->core_layout.size; + mod_mem = &me->mem[MOD_TEXT]; + mod_mem->size = ALIGN(mod_mem->size, 4); + me->arch.got_offset = mod_mem->size; + mod_mem->size += me->arch.got_size; + me->arch.plt_offset = mod_mem->size; if (me->arch.plt_size) { if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) me->arch.plt_size += PLT_ENTRY_SIZE; - me->core_layout.size += me->arch.plt_size; + mod_mem->size += me->arch.plt_size; } return 0; } @@ -293,7 +293,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTPLT64: /* 64 bit offset to jump slot. */ case R_390_GOTPLTENT: /* 32 bit rel. offset to jump slot >> 1. */ if (info->got_initialized == 0) { - Elf_Addr *gotent = me->core_layout.base + + Elf_Addr *gotent = me->mem[MOD_TEXT].base + me->arch.got_offset + info->got_offset; @@ -318,7 +318,8 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, rc = apply_rela_bits(loc, val, 0, 64, 0, write); else if (r_type == R_390_GOTENT || r_type == R_390_GOTPLTENT) { - val += (Elf_Addr) me->core_layout.base - loc; + val += (Elf_Addr)me->mem[MOD_TEXT].base + + me->arch.got_offset - loc; rc = apply_rela_bits(loc, val, 1, 32, 1, write); } break; @@ -330,27 +331,26 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */ case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */ if (info->plt_initialized == 0) { - unsigned int insn[5]; - unsigned int *ip = me->core_layout.base + - me->arch.plt_offset + - info->plt_offset; - - insn[0] = 0x0d10e310; /* basr 1,0 */ - insn[1] = 0x100a0004; /* lg 1,10(1) */ + unsigned char insn[PLT_ENTRY_SIZE]; + char *plt_base; + char *ip; + + plt_base = me->mem[MOD_TEXT].base + me->arch.plt_offset; + ip = plt_base + info->plt_offset; + *(int *)insn = 0x0d10e310; /* basr 1,0 */ + *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */ if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) { - unsigned int *ij; - ij = me->core_layout.base + - me->arch.plt_offset + - me->arch.plt_size - PLT_ENTRY_SIZE; - insn[2] = 0xa7f40000 + /* j __jump_r1 */ - (unsigned int)(u16) - (((unsigned long) ij - 8 - - (unsigned long) ip) / 2); + char *jump_r1; + + jump_r1 = plt_base + me->arch.plt_size - + PLT_ENTRY_SIZE; + /* brcl 0xf,__jump_r1 */ + *(short *)&insn[8] = 0xc0f4; + *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2; } else { - insn[2] = 0x07f10000; /* br %r1 */ + *(int *)&insn[8] = 0x07f10000; /* br %r1 */ } - insn[3] = (unsigned int) (val >> 32); - insn[4] = (unsigned int) val; + *(long *)&insn[14] = val; write(ip, insn, sizeof(insn)); info->plt_initialized = 1; @@ -365,7 +365,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, val - loc + 0xffffUL < 0x1ffffeUL) || (r_type == R_390_PLT32DBL && val - loc + 0xffffffffULL < 0x1fffffffeULL))) - val = (Elf_Addr) me->core_layout.base + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.plt_offset + info->plt_offset; val += rela->r_addend - loc; @@ -387,7 +387,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, case R_390_GOTOFF32: /* 32 bit offset to GOT. */ case R_390_GOTOFF64: /* 64 bit offset to GOT. */ val = val + rela->r_addend - - ((Elf_Addr) me->core_layout.base + me->arch.got_offset); + ((Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset); if (r_type == R_390_GOTOFF16) rc = apply_rela_bits(loc, val, 0, 16, 0, write); else if (r_type == R_390_GOTOFF32) @@ -397,7 +397,7 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab, break; case R_390_GOTPC: /* 32 bit PC relative offset to GOT. */ case R_390_GOTPCDBL: /* 32 bit PC rel. off. to GOT shifted by 1. */ - val = (Elf_Addr) me->core_layout.base + me->arch.got_offset + + val = (Elf_Addr) me->mem[MOD_TEXT].base + me->arch.got_offset + rela->r_addend - loc; if (r_type == R_390_GOTPC) rc = apply_rela_bits(loc, val, 1, 32, 0, write); @@ -466,6 +466,30 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, write); } +#ifdef CONFIG_FUNCTION_TRACER +static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, + const Elf_Shdr *s) +{ + char *start, *end; + int numpages; + size_t size; + + size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); + numpages = DIV_ROUND_UP(size, PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); + if (!start) + return -ENOMEM; + set_memory_rox((unsigned long)start, numpages); + end = start + size; + + me->arch.trampolines_start = (struct ftrace_hotpatch_trampoline *)start; + me->arch.trampolines_end = (struct ftrace_hotpatch_trampoline *)end; + me->arch.next_trampoline = me->arch.trampolines_start; + + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) @@ -473,22 +497,17 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings, *secname; void *aseg; + int rc = 0; if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable && me->arch.plt_size) { unsigned int *ij; - ij = me->core_layout.base + me->arch.plt_offset + + ij = me->mem[MOD_TEXT].base + me->arch.plt_offset + me->arch.plt_size - PLT_ENTRY_SIZE; - if (test_facility(35)) { - ij[0] = 0xc6000000; /* exrl %r0,.+10 */ - ij[1] = 0x0005a7f4; /* j . */ - ij[2] = 0x000007f1; /* br %r1 */ - } else { - ij[0] = 0x44000000 | (unsigned int) - offsetof(struct lowcore, br_r1_trampoline); - ij[1] = 0xa7f40000; /* j . */ - } + ij[0] = 0xc6000000; /* exrl %r0,.+10 */ + ij[1] = 0x0005a7f4; /* j . */ + ij[2] = 0x000007f1; /* br %r1 */ } secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -507,8 +526,22 @@ int module_finalize(const Elf_Ehdr *hdr, if (IS_ENABLED(CONFIG_EXPOLINE) && (str_has_prefix(secname, ".s390_return"))) nospec_revert(aseg, aseg + s->sh_size); + + if (IS_ENABLED(CONFIG_STACKPROTECTOR) && + (str_has_prefix(secname, "__stack_protector_loc"))) { + rc = stack_protector_apply(aseg, aseg + s->sh_size); + if (rc) + break; + } + +#ifdef CONFIG_FUNCTION_TRACER + if (!strcmp(FTRACE_CALLSITE_SECTION, secname)) { + rc = module_alloc_ftrace_hotpatch_trampolines(me, s); + if (rc) + break; + } +#endif /* CONFIG_FUNCTION_TRACER */ } - jump_label_apply_nops(me); - return 0; + return rc; } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 20f8e1868853..a55abbf65333 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -6,12 +6,14 @@ * Author(s): Ingo Adlung <adlung@de.ibm.com>, * Martin Schwidefsky <schwidefsky@de.ibm.com>, * Cornelia Huck <cornelia.huck@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, */ #include <linux/kernel_stat.h> +#include <linux/utsname.h> +#include <linux/cpufeature.h> #include <linux/init.h> #include <linux/errno.h> +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <linux/log2.h> #include <linux/kprobes.h> @@ -19,18 +21,18 @@ #include <linux/time.h> #include <linux/module.h> #include <linux/sched/signal.h> - -#include <linux/export.h> +#include <linux/kvm_host.h> #include <asm/lowcore.h> +#include <asm/ctlreg.h> +#include <asm/fpu.h> #include <asm/smp.h> #include <asm/stp.h> #include <asm/cputime.h> #include <asm/nmi.h> #include <asm/crw.h> -#include <asm/switch_to.h> -#include <asm/ctl_reg.h> #include <asm/asm-offsets.h> -#include <linux/kvm_host.h> +#include <asm/pai.h> +#include <asm/vtime.h> struct mcck_struct { unsigned int kill_task : 1; @@ -41,92 +43,177 @@ struct mcck_struct { }; static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck); -static struct kmem_cache *mcesa_cache; -static unsigned long mcesa_origin_lc; static inline int nmi_needs_mcesa(void) { - return MACHINE_HAS_VX || MACHINE_HAS_GS; -} - -static inline unsigned long nmi_get_mcesa_size(void) -{ - if (MACHINE_HAS_GS) - return MCESA_MAX_SIZE; - return MCESA_MIN_SIZE; + return cpu_has_vx() || cpu_has_gs(); } /* * The initial machine check extended save area for the boot CPU. - * It will be replaced by nmi_init() with an allocated structure. - * The structure is required for machine check happening early in - * the boot process. + * It will be replaced on the boot CPU reinit with an allocated + * structure. The structure is required for machine check happening + * early in the boot process. */ -static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE); +static struct mcesa boot_mcesa __aligned(MCESA_MAX_SIZE); -void __init nmi_alloc_boot_cpu(struct lowcore *lc) +void __init nmi_alloc_mcesa_early(u64 *mcesad) { if (!nmi_needs_mcesa()) return; - lc->mcesad = (unsigned long) &boot_mcesa; - if (MACHINE_HAS_GS) - lc->mcesad |= ilog2(MCESA_MAX_SIZE); + *mcesad = __pa(&boot_mcesa); + if (cpu_has_gs()) + *mcesad |= ilog2(MCESA_MAX_SIZE); } -static int __init nmi_init(void) +int nmi_alloc_mcesa(u64 *mcesad) { - unsigned long origin, cr0, size; + unsigned long size; + void *origin; + *mcesad = 0; if (!nmi_needs_mcesa()) return 0; - size = nmi_get_mcesa_size(); - if (size > MCESA_MIN_SIZE) - mcesa_origin_lc = ilog2(size); - /* create slab cache for the machine-check-extended-save-areas */ - mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL); - if (!mcesa_cache) - panic("Couldn't create nmi save area cache"); - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); + size = cpu_has_gs() ? MCESA_MAX_SIZE : MCESA_MIN_SIZE; + origin = kmalloc(size, GFP_KERNEL); if (!origin) - panic("Couldn't allocate nmi save area"); + return -ENOMEM; /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - /* Replace boot_mcesa on the boot CPU */ - S390_lowcore.mcesad = origin | mcesa_origin_lc; - __ctl_load(cr0, 0, 0); + kmemleak_not_leak(origin); + *mcesad = __pa(origin); + if (cpu_has_gs()) + *mcesad |= ilog2(MCESA_MAX_SIZE); return 0; } -early_initcall(nmi_init); -int nmi_alloc_per_cpu(struct lowcore *lc) +void nmi_free_mcesa(u64 *mcesad) { - unsigned long origin; - if (!nmi_needs_mcesa()) - return 0; - origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL); - if (!origin) - return -ENOMEM; - /* The pointer is stored with mcesa_bits ORed in */ - kmemleak_not_leak((void *) origin); - lc->mcesad = origin | mcesa_origin_lc; - return 0; + return; + kfree(__va(*mcesad & MCESA_ORIGIN_MASK)); } -void nmi_free_per_cpu(struct lowcore *lc) +static __always_inline char *nmi_puts(char *dest, const char *src) { - if (!nmi_needs_mcesa()) - return; - kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK)); + while (*src) + *dest++ = *src++; + *dest = 0; + return dest; } -static notrace void s390_handle_damage(void) +static __always_inline char *u64_to_hex(char *dest, u64 val) { + int i, num; + + for (i = 1; i <= 16; i++) { + num = (val >> (64 - 4 * i)) & 0xf; + if (num >= 10) + *dest++ = 'A' + num - 10; + else + *dest++ = '0' + num; + } + *dest = 0; + return dest; +} + +static notrace void nmi_print_info(void) +{ + struct lowcore *lc = get_lowcore(); + char message[100]; + char *ptr; + int i; + + ptr = nmi_puts(message, "Unrecoverable machine check, code: "); + ptr = u64_to_hex(ptr, lc->mcck_interruption_code); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, init_utsname()->release); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, arch_hw_string); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "PSW: "); + ptr = u64_to_hex(ptr, lc->mcck_old_psw.mask); + ptr = nmi_puts(ptr, " "); + ptr = u64_to_hex(ptr, lc->mcck_old_psw.addr); + ptr = nmi_puts(ptr, " PFX: "); + ptr = u64_to_hex(ptr, (u64)get_lowcore()); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "LBA: "); + ptr = u64_to_hex(ptr, lc->last_break_save_area); + ptr = nmi_puts(ptr, " EDC: "); + ptr = u64_to_hex(ptr, lc->external_damage_code); + ptr = nmi_puts(ptr, " FSA: "); + ptr = u64_to_hex(ptr, lc->failing_storage_address); + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + + ptr = nmi_puts(message, "CRS:\n"); + sclp_emergency_printk(message); + ptr = message; + for (i = 0; i < 16; i++) { + ptr = u64_to_hex(ptr, lc->cregs_save_area[i].val); + ptr = nmi_puts(ptr, " "); + if ((i + 1) % 4 == 0) { + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + ptr = message; + } + } + + ptr = nmi_puts(message, "GPRS:\n"); + sclp_emergency_printk(message); + ptr = message; + for (i = 0; i < 16; i++) { + ptr = u64_to_hex(ptr, lc->gpregs_save_area[i]); + ptr = nmi_puts(ptr, " "); + if ((i + 1) % 4 == 0) { + ptr = nmi_puts(ptr, "\n"); + sclp_emergency_printk(message); + ptr = message; + } + } + + ptr = nmi_puts(message, "System stopped\n"); + sclp_emergency_printk(message); +} + +static notrace void __noreturn s390_handle_damage(void) +{ + struct lowcore *lc = get_lowcore(); + union ctlreg0 cr0, cr0_new; + psw_t psw_save; + smp_emergency_stop(); + diag_amode31_ops.diag308_reset(); + + /* + * Disable low address protection and make machine check new PSW a + * disabled wait PSW. Any additional machine check cannot be handled. + */ + local_ctl_store(0, &cr0.reg); + cr0_new = cr0; + cr0_new.lap = 0; + local_ctl_load(0, &cr0_new.reg); + psw_save = lc->mcck_new_psw; + psw_bits(lc->mcck_new_psw).io = 0; + psw_bits(lc->mcck_new_psw).ext = 0; + psw_bits(lc->mcck_new_psw).wait = 1; + nmi_print_info(); + + /* + * Restore machine check new PSW and control register 0 to original + * values. This makes possible system dump analysis easier. + */ + lc->mcck_new_psw = psw_save; + local_ctl_load(0, &cr0.reg); disabled_wait(); - while (1); } NOKPROBE_SYMBOL(s390_handle_damage); @@ -134,19 +221,20 @@ NOKPROBE_SYMBOL(s390_handle_damage); * Main machine check handler function. Will be called with interrupts disabled * and machine checks enabled. */ -void __s390_handle_mcck(void) +void s390_handle_mcck(void) { struct mcck_struct mcck; + unsigned long mflags; /* * Disable machine checks and get the current state of accumulated * machine checks. Afterwards delete the old state and enable machine * checks again. */ - local_mcck_disable(); + local_mcck_save(mflags); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); - local_mcck_enable(); + local_mcck_restore(mflags); if (mcck.channel_report) crw_handle_channel_report(); @@ -163,184 +251,80 @@ void __s390_handle_mcck(void) static int mchchk_wng_posted = 0; /* Use single cpu clear, as we cannot handle smp here. */ - __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ + local_ctl_clear_bit(14, CR14_WARNING_SUBMASK_BIT); if (xchg(&mchchk_wng_posted, 1) == 0) kill_cad_pid(SIGPWR, 1); } if (mcck.stp_queue) stp_queue_work(); if (mcck.kill_task) { - local_irq_enable(); printk(KERN_EMERG "mcck: Terminating task because of machine " "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - do_exit(SIGSEGV); + if (is_global_init(current)) + panic("mcck: Attempting to kill init!\n"); + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, current, PIDTYPE_PID); } } -void noinstr s390_handle_mcck(void) -{ - trace_hardirqs_off(); - __s390_handle_mcck(); - trace_hardirqs_on(); -} -/* - * returns 0 if all required registers are available - * returns 1 otherwise +/** + * nmi_registers_valid - verify if registers are valid + * @mci: machine check interruption code + * + * Inspect a machine check interruption code and verify if all required + * registers are valid. For some registers the corresponding validity bit is + * ignored and the registers are set to the expected value. + * Returns true if all registers are valid, otherwise false. */ -static int notrace s390_validate_registers(union mci mci, int umode) +static bool notrace nmi_registers_valid(union mci mci) { - struct mcesa *mcesa; - void *fpt_save_area; union ctlreg2 cr2; - int kill_task; - u64 zero; - - kill_task = 0; - zero = 0; - - if (!mci.gr) { - /* - * General purpose registers couldn't be restored and have - * unknown contents. Stop system or terminate process. - */ - if (!umode) - s390_handle_damage(); - kill_task = 1; - } - if (!mci.fp) { - /* - * Floating point registers can't be restored. If the - * kernel currently uses floating point registers the - * system is stopped. If the process has its floating - * pointer registers loaded it is terminated. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - fpt_save_area = &S390_lowcore.floating_pt_save_area; - if (!mci.fc) { - /* - * Floating point control register can't be restored. - * If the kernel currently uses the floating pointer - * registers and needs the FPC register the system is - * stopped. If the process has its floating pointer - * registers loaded it is terminated. Otherwise the - * FPC is just validated. - */ - if (S390_lowcore.fpu_flags & KERNEL_FPC) - s390_handle_damage(); - asm volatile( - " lfpc %0\n" - : - : "Q" (zero)); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } else { - asm volatile( - " lfpc %0\n" - : - : "Q" (S390_lowcore.fpt_creg_save_area)); - } - mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); - if (!MACHINE_HAS_VX) { - /* Validate floating point registers */ - asm volatile( - " ld 0,0(%0)\n" - " ld 1,8(%0)\n" - " ld 2,16(%0)\n" - " ld 3,24(%0)\n" - " ld 4,32(%0)\n" - " ld 5,40(%0)\n" - " ld 6,48(%0)\n" - " ld 7,56(%0)\n" - " ld 8,64(%0)\n" - " ld 9,72(%0)\n" - " ld 10,80(%0)\n" - " ld 11,88(%0)\n" - " ld 12,96(%0)\n" - " ld 13,104(%0)\n" - " ld 14,112(%0)\n" - " ld 15,120(%0)\n" - : - : "a" (fpt_save_area) - : "memory"); - } else { - /* Validate vector registers */ - union ctlreg0 cr0; - - if (!mci.vr) { - /* - * Vector registers can't be restored. If the kernel - * currently uses vector registers the system is - * stopped. If the process has its vector registers - * loaded it is terminated. Otherwise just validate - * the registers. - */ - if (S390_lowcore.fpu_flags & KERNEL_VXR) - s390_handle_damage(); - if (!test_cpu_flag(CIF_FPU)) - kill_task = 1; - } - cr0.val = S390_lowcore.cregs_save_area[0]; - cr0.afp = cr0.vx = 1; - __ctl_load(cr0.val, 0, 0); - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ - : - : "Q" (*(struct vx_array *)mcesa->vector_save_area) - : "1"); - __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); - } - /* Validate access registers */ - asm volatile( - " lam 0,15,0(%0)\n" - : - : "a" (&S390_lowcore.access_regs_save_area) - : "memory"); - if (!mci.ar) { - /* - * Access registers have unknown contents. - * Terminating task. - */ - kill_task = 1; - } - /* Validate guarded storage registers */ - cr2.val = S390_lowcore.cregs_save_area[2]; - if (cr2.gse) { - if (!mci.gs) { - /* - * Guarded storage register can't be restored and - * the current processes uses guarded storage. - * It has to be terminated. - */ - kill_task = 1; - } else { - load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); - } - } /* - * The getcpu vdso syscall reads CPU number from the programmable + * The getcpu vdso syscall reads the CPU number from the programmable * field of the TOD clock. Disregard the TOD programmable register - * validity bit and load the CPU number into the TOD programmable - * field unconditionally. + * validity bit and load the CPU number into the TOD programmable field + * unconditionally. */ set_tod_programmable_field(raw_smp_processor_id()); - /* Validate clock comparator register */ - set_clock_comparator(S390_lowcore.clock_comparator); - + /* + * Set the clock comparator register to the next expected value. + */ + set_clock_comparator(get_lowcore()->clock_comparator); + if (!mci.gr || !mci.fp || !mci.fc) + return false; + /* + * The vector validity must only be checked if not running a + * KVM guest. For KVM guests the machine check is forwarded by + * KVM and it is the responsibility of the guest to take + * appropriate actions. The host vector or FPU values have been + * saved by KVM and will be restored by KVM. + */ + if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) + return false; + if (!mci.ar) + return false; + /* + * Two cases for guarded storage registers: + * - machine check in kernel or userspace + * - machine check while running SIE (KVM guest) + * For kernel or userspace the userspace values of guarded storage + * control can not be recreated, the process must be terminated. + * For SIE the guest values of guarded storage can not be recreated. + * This is either due to a bug or due to GS being disabled in the + * guest. The guest will be notified by KVM code and the guests machine + * check handling must take care of this. The host values are saved by + * KVM and are not affected. + */ + cr2.reg = get_lowcore()->cregs_save_area[2]; + if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST)) + return false; if (!mci.ms || !mci.pm || !mci.ia) - kill_task = 1; - - return kill_task; + return false; + return true; } -NOKPROBE_SYMBOL(s390_validate_registers); +NOKPROBE_SYMBOL(nmi_registers_valid); /* * Backup the guest's machine check info to its description block @@ -351,8 +335,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) struct sie_page *sie_page; /* r14 contains the sie block, which was set in sie64a */ - struct kvm_s390_sie_block *sie_block = - (struct kvm_s390_sie_block *) regs->gprs[14]; + struct kvm_s390_sie_block *sie_block = phys_to_virt(regs->gprs[14]); if (sie_block == NULL) /* Something's seriously wrong, stop system. */ @@ -360,11 +343,10 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) sie_page = container_of(sie_block, struct sie_page, sie_block); mcck_backup = &sie_page->mcck_info; - mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + mcck_backup->mcic = get_lowcore()->mcck_interruption_code & ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); - mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; - mcck_backup->failing_storage_address - = S390_lowcore.failing_storage_address; + mcck_backup->ext_damage_code = get_lowcore()->external_damage_code; + mcck_backup->failing_storage_address = get_lowcore()->failing_storage_address; } NOKPROBE_SYMBOL(s390_backup_mcck_info); @@ -379,23 +361,25 @@ NOKPROBE_SYMBOL(s390_backup_mcck_info); /* * machine check handler. */ -int notrace s390_do_machine_check(struct pt_regs *regs) +void notrace s390_do_machine_check(struct pt_regs *regs) { static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); static unsigned long long last_ipd; + struct lowcore *lc = get_lowcore(); struct mcck_struct *mcck; unsigned long long tmp; + irqentry_state_t irq_state; union mci mci; unsigned long mcck_dam_code; int mcck_pending = 0; - nmi_enter(); + irq_state = irqentry_nmi_enter(regs); if (user_mode(regs)) update_timer_mcck(); inc_irq_stat(NMI_NMI); - mci.val = S390_lowcore.mcck_interruption_code; + mci.val = lc->mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); /* @@ -438,7 +422,9 @@ int notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_validate_registers(mci, user_mode(regs))) { + if (!nmi_registers_valid(mci)) { + if (!user_mode(regs)) + s390_handle_damage(); /* * Couldn't restore all register contents for the * user space process -> mark task for termination. @@ -461,13 +447,27 @@ int notrace s390_do_machine_check(struct pt_regs *regs) } if (mci.ed && mci.ec) { /* External damage */ - if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + if (lc->external_damage_code & (1U << ED_STP_SYNC)) mcck->stp_queue |= stp_sync_check(); - if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + if (lc->external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); mcck_pending = 1; } - + /* + * Reinject storage related machine checks into the guest if they + * happen when the guest is running. + */ + if (!test_cpu_flag(CIF_MCCK_GUEST)) { + /* Storage error uncorrected */ + if (mci.se) + s390_handle_damage(); + /* Storage key-error uncorrected */ + if (mci.ke) + s390_handle_damage(); + /* Storage degradation */ + if (mci.ds && mci.fa) + s390_handle_damage(); + } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; @@ -492,24 +492,18 @@ int notrace s390_do_machine_check(struct pt_regs *regs) } clear_cpu_flag(CIF_MCCK_GUEST); - if (user_mode(regs) && mcck_pending) { - nmi_exit(); - return 1; - } - if (mcck_pending) schedule_mcck_handler(); - nmi_exit(); - return 0; + irqentry_nmi_exit(regs, irq_state); } NOKPROBE_SYMBOL(s390_do_machine_check); static int __init machine_check_init(void) { - ctl_set_bit(14, 25); /* enable external damage MCH */ - ctl_set_bit(14, 27); /* enable system recovery MCH */ - ctl_set_bit(14, 24); /* enable warning MCH */ + system_ctl_set_bit(14, CR14_EXTERNAL_DAMAGE_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_RECOVERY_SUBMASK_BIT); + system_ctl_set_bit(14, CR14_WARNING_SUBMASK_BIT); return 0; } early_initcall(machine_check_init); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 250e4dbf653c..e11ec15960a1 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -4,6 +4,8 @@ #include <linux/cpu.h> #include <asm/nospec-branch.h> +int nobp = IS_ENABLED(CONFIG_KERNEL_NOBP); + static int __init nobp_setup_early(char *str) { bool enabled; @@ -14,14 +16,14 @@ static int __init nobp_setup_early(char *str) return rc; if (enabled && test_facility(82)) { /* - * The user explicitely requested nobp=1, enable it and + * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, alt_stfle_fac_list); + nobp = 1; if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } return 0; } @@ -29,7 +31,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; return 0; } early_param("nospec", nospec_setup_early); @@ -38,9 +40,9 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) + if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) + if (nobp_enabled()) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +68,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. */ nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } /* * If the kernel has not been compiled with expolines the @@ -86,7 +88,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; @@ -105,6 +107,7 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) s32 *epo; /* Second part of the instruction replace is always a nop */ + memcpy(insnbuf + 2, branch, sizeof(branch)); for (epo = start; epo < end; epo++) { instr = (u8 *) epo + *epo; if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) @@ -113,46 +116,24 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) type = BRASL_EXPOLINE; /* brasl instruction */ else continue; - thunk = instr + (*(int *)(instr + 2)) * 2; + thunk = instr + (long)(*(int *)(instr + 2)) * 2; if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ - br = thunk + (*(int *)(thunk + 2)) * 2; - else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 && - thunk[6] == 0x44 && thunk[7] == 0x00 && - (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 && - (thunk[1] & 0xf0) == (thunk[8] & 0xf0)) - /* larl %rx,<target br> + ex %r0,0(%rx) */ - br = thunk + (*(int *)(thunk + 2)) * 2; + br = thunk + (long)(*(int *)(thunk + 2)) * 2; else continue; - /* Check for unconditional branch 0x07f? or 0x47f???? */ - if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) + if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) continue; - - memcpy(insnbuf + 2, branch, sizeof(branch)); switch (type) { case BRCL_EXPOLINE: + /* brcl to thunk, replace with br + nop */ insnbuf[0] = br[0]; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brcl to b, replace with bc + nopr */ - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brcl to br, replace with bcr + nop */ - } break; case BRASL_EXPOLINE: + /* brasl to thunk, replace with basr + nop */ + insnbuf[0] = 0x0d; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); - if (br[0] == 0x47) { - /* brasl to b, replace with bas + nopr */ - insnbuf[0] = 0x4d; - insnbuf[2] = br[2]; - insnbuf[3] = br[3]; - } else { - /* brasl to br, replace with basr + nop */ - insnbuf[0] = 0x0d; - } break; } diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index b4b5c8c21166..5970dd3ee7c5 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -7,17 +7,17 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return sysfs_emit(buf, "Mitigation: __user pointer sanitization\n"); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { if (test_facility(156)) - return sprintf(buf, "Mitigation: etokens\n"); - if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) - return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) - return sprintf(buf, "Mitigation: limited branch prediction\n"); - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Mitigation: etokens\n"); + if (nospec_uses_trampoline()) + return sysfs_emit(buf, "Mitigation: execute trampolines\n"); + if (nobp_enabled()) + return sysfs_emit(buf, "Mitigation: limited branch prediction\n"); + return sysfs_emit(buf, "Vulnerable\n"); } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 51c5a9f6e525..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include <linux/node.h> #include <asm/numa.h> -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; @@ -24,19 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } - -static int __init numa_init_late(void) -{ - register_one_node(0); - return 0; -} -arch_initcall(numa_init_late); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 5a7420b23aa8..94fa44776d0c 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -6,15 +6,19 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "os_info" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "os_info: " fmt #include <linux/crash_dump.h> #include <linux/kernel.h> #include <linux/slab.h> #include <asm/checksum.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> +#include <asm/asm-offsets.h> +#include <asm/sections.h> +#include <asm/ipl.h> /* * OS info structure has to be page aligned @@ -27,7 +31,7 @@ static struct os_info os_info __page_aligned_data; u32 os_info_csum(struct os_info *os_info) { int size = sizeof(*os_info) - offsetof(struct os_info, version_major); - return (__force u32)csum_partial(&os_info->version_major, size, 0); + return (__force u32)cksm(&os_info->version_major, size, 0); } /* @@ -41,13 +45,24 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) } /* - * Add OS info entry and update checksum + * Add OS info data entry and update checksum */ -void os_info_entry_add(int nr, void *ptr, u64 size) +void os_info_entry_add_data(int nr, void *ptr, u64 size) { - os_info.entry[nr].addr = (u64)(unsigned long)ptr; + os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; - os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0); + os_info.entry[nr].csum = (__force u32)cksm(ptr, size, 0); + os_info.csum = os_info_csum(&os_info); +} + +/* + * Add OS info value entry and update checksum + */ +void os_info_entry_add_val(int nr, u64 value) +{ + os_info.entry[nr].val = value; + os_info.entry[nr].size = 0; + os_info.entry[nr].csum = 0; os_info.csum = os_info_csum(&os_info); } @@ -56,13 +71,25 @@ void os_info_entry_add(int nr, void *ptr, u64 size) */ void __init os_info_init(void) { - void *ptr = &os_info; + struct lowcore *abs_lc; + BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE); os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; + os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base); + os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset()); + os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys); + os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap); + os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START); + os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END); + os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext); + os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end); + os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext)); os_info.csum = os_info_csum(&os_info); - mem_assign_absolute(S390_lowcore.os_info, (unsigned long) ptr); + abs_lc = get_abs_lowcore(); + abs_lc->os_info = __pa(&os_info); + put_abs_lowcore(abs_lc); } #ifdef CONFIG_CRASH_DUMP @@ -90,11 +117,11 @@ static void os_info_old_alloc(int nr, int align) goto fail; } buf_align = PTR_ALIGN(buf, align); - if (copy_oldmem_kernel(buf_align, (void *) addr, size)) { + if (copy_oldmem_kernel(buf_align, addr, size)) { msg = "copy failed"; goto fail_free; } - csum = (__force u32)csum_partial(buf_align, size, 0); + csum = (__force u32)cksm(buf_align, size, 0); if (csum != os_info_old->entry[nr].csum) { msg = "checksum failed"; goto fail_free; @@ -121,17 +148,16 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!OLDMEM_BASE) + if (!oldmem_data.start && !is_ipl_type_dump()) goto fail; - if (copy_oldmem_kernel(&addr, &S390_lowcore.os_info, sizeof(addr))) + if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; if (addr == 0 || addr % PAGE_SIZE) goto fail; os_info_old = kzalloc(sizeof(*os_info_old), GFP_KERNEL); if (!os_info_old) goto fail; - if (copy_oldmem_kernel(os_info_old, (void *) addr, - sizeof(*os_info_old))) + if (copy_oldmem_kernel(os_info_old, addr, sizeof(*os_info_old))) goto fail_free; if (os_info_old->magic != OS_INFO_MAGIC) goto fail_free; @@ -154,7 +180,7 @@ fail: } /* - * Return pointer to os infor entry and its size + * Return pointer to os info entry and its size */ void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 975a00c8c564..408ab93112bf 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,28 +2,322 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2021 + * Copyright IBM Corp. 2012, 2023 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> * Thomas Richter <tmricht@linux.ibm.com> */ -#define KMSG_COMPONENT "cpum_cf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_cf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/init.h> -#include <linux/export.h> #include <linux/miscdevice.h> +#include <linux/perf_event.h> -#include <asm/cpu_mcf.h> +#include <asm/cpu_mf.h> #include <asm/hwctrset.h> #include <asm/debug.h> +/* Perf PMU definitions for the counter facility */ +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ +#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ + +enum cpumf_ctr_set { + CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ + CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ + CPUMF_CTR_SET_CRYPTO = 2, /* Crypto-Activity Counter Set */ + CPUMF_CTR_SET_EXT = 3, /* Extended Counter Set */ + CPUMF_CTR_SET_MT_DIAG = 4, /* MT-diagnostic Counter Set */ + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 + +static inline void ctr_set_enable(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; +} + +static inline void ctr_set_disable(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); +} + +static inline void ctr_set_start(u64 *state, u64 ctrsets) +{ + *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; +} + +static inline void ctr_set_stop(u64 *state, u64 ctrsets) +{ + *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +static inline int ctr_stcctm(enum cpumf_ctr_set set, u64 range, u64 *dest) +{ + switch (set) { + case CPUMF_CTR_SET_BASIC: + return stcctm(BASIC, range, dest); + case CPUMF_CTR_SET_USER: + return stcctm(PROBLEM_STATE, range, dest); + case CPUMF_CTR_SET_CRYPTO: + return stcctm(CRYPTO_ACTIVITY, range, dest); + case CPUMF_CTR_SET_EXT: + return stcctm(EXTENDED, range, dest); + case CPUMF_CTR_SET_MT_DIAG: + return stcctm(MT_DIAG_CLEARING, range, dest); + case CPUMF_CTR_SET_MAX: + return 3; + } + return 3; +} + +struct cpu_cf_events { + refcount_t refcnt; /* Reference count */ + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ + unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ +}; + static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ static debug_info_t *cf_dbg; +/* + * The CPU Measurement query counter information instruction contains + * information which varies per machine generation, but is constant and + * does not change when running on a particular machine, such as counter + * first and second version number. This is needed to determine the size + * of counter sets. Extract this information at device driver initialization. + */ +static struct cpumf_ctr_info cpumf_ctr_info; + +struct cpu_cf_ptr { + struct cpu_cf_events *cpucf; +}; + +static struct cpu_cf_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct cpu_cf_ptr __percpu *cfptr; +} cpu_cf_root; + +/* + * Serialize event initialization and event removal. Both are called from + * user space in task context with perf_event_open() and close() + * system calls. + * + * This mutex serializes functions cpum_cf_alloc_cpu() called at event + * initialization via cpumf_pmu_event_init() and function cpum_cf_free_cpu() + * called at event removal via call back function hw_perf_event_destroy() + * when the event is deleted. They are serialized to enforce correct + * bookkeeping of pointer and reference counts anchored by + * struct cpu_cf_root and the access to cpu_cf_root::refcnt and the + * per CPU pointers stored in cpu_cf_root::cfptr. + */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Get pointer to per-cpu structure. + * + * Function get_cpu_cfhw() is called from + * - cfset_copy_all(): This function is protected by cpus_read_lock(), so + * CPU hot plug remove can not happen. Event removal requires a close() + * first. + * + * Function this_cpu_cfhw() is called from perf common code functions: + * - pmu_{en|dis}able(), pmu_{add|del}()and pmu_{start|stop}(): + * All functions execute with interrupts disabled on that particular CPU. + * - cfset_ioctl_{on|off}, cfset_cpu_read(): see comment cfset_copy_all(). + * + * Therefore it is safe to access the CPU specific pointer to the event. + */ +static struct cpu_cf_events *get_cpu_cfhw(int cpu) +{ + struct cpu_cf_ptr __percpu *p = cpu_cf_root.cfptr; + + if (p) { + struct cpu_cf_ptr *q = per_cpu_ptr(p, cpu); + + return q->cpucf; + } + return NULL; +} + +static struct cpu_cf_events *this_cpu_cfhw(void) +{ + return get_cpu_cfhw(smp_processor_id()); +} + +/* Disable counter sets on dedicated CPU */ +static void cpum_cf_reset_cpu(void *flags) +{ + lcctl(0); +} + +/* Free per CPU data when the last event is removed. */ +static void cpum_cf_free_root(void) +{ + if (!refcount_dec_and_test(&cpu_cf_root.refcnt)) + return; + free_percpu(cpu_cf_root.cfptr); + cpu_cf_root.cfptr = NULL; + irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + debug_sprintf_event(cf_dbg, 4, "%s root.refcnt %u cfptr %d\n", + __func__, refcount_read(&cpu_cf_root.refcnt), + !cpu_cf_root.cfptr); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int cpum_cf_alloc_root(void) +{ + int rc = 0; + + if (refcount_inc_not_zero(&cpu_cf_root.refcnt)) + return rc; + + /* The memory is already zeroed. */ + cpu_cf_root.cfptr = alloc_percpu(struct cpu_cf_ptr); + if (cpu_cf_root.cfptr) { + refcount_set(&cpu_cf_root.refcnt, 1); + on_each_cpu(cpum_cf_reset_cpu, NULL, 1); + irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); + } else { + rc = -ENOMEM; + } + + return rc; +} + +/* Free CPU counter data structure for a PMU */ +static void cpum_cf_free_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + + mutex_lock(&pmc_reserve_mutex); + /* + * When invoked via CPU hotplug handler, there might be no events + * installed or that particular CPU might not have an + * event installed. This anchor pointer can be NULL! + */ + if (!cpu_cf_root.cfptr) + goto out; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + /* + * Might be zero when called from CPU hotplug handler and no event + * installed on that CPU, but on different CPUs. + */ + if (!cpuhw) + goto out; + + if (refcount_dec_and_test(&cpuhw->refcnt)) { + kfree(cpuhw); + p->cpucf = NULL; + } + cpum_cf_free_root(); +out: + mutex_unlock(&pmc_reserve_mutex); +} + +/* Allocate CPU counter data structure for a PMU. Called under mutex lock. */ +static int cpum_cf_alloc_cpu(int cpu) +{ + struct cpu_cf_events *cpuhw; + struct cpu_cf_ptr *p; + int rc; + + mutex_lock(&pmc_reserve_mutex); + rc = cpum_cf_alloc_root(); + if (rc) + goto unlock; + p = per_cpu_ptr(cpu_cf_root.cfptr, cpu); + cpuhw = p->cpucf; + + if (!cpuhw) { + cpuhw = kzalloc(sizeof(*cpuhw), GFP_KERNEL); + if (cpuhw) { + p->cpucf = cpuhw; + refcount_set(&cpuhw->refcnt, 1); + } else { + rc = -ENOMEM; + } + } else { + refcount_inc(&cpuhw->refcnt); + } + if (rc) { + /* + * Error in allocation of event, decrement anchor. Since + * cpu_cf_event in not created, its destroy() function is not + * invoked. Adjust the reference counter for the anchor. + */ + cpum_cf_free_root(); + } +unlock: + mutex_unlock(&pmc_reserve_mutex); + return rc; +} + +/* + * Create/delete per CPU data structures for /dev/hwctr interface and events + * created by perf_event_open(). + * If cpu is -1, track task on all available CPUs. This requires + * allocation of hardware data structures for all CPUs. This setup handles + * perf_event_open() with task context and /dev/hwctr interface. + * If cpu is non-zero install event on this CPU only. This setup handles + * perf_event_open() with CPU context. + */ +static int cpum_cf_alloc(int cpu) +{ + cpumask_var_t mask; + int rc; + + if (cpu == -1) { + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + for_each_online_cpu(cpu) { + rc = cpum_cf_alloc_cpu(cpu); + if (rc) { + for_each_cpu(cpu, mask) + cpum_cf_free_cpu(cpu); + break; + } + cpumask_set_cpu(cpu, mask); + } + free_cpumask_var(mask); + } else { + rc = cpum_cf_alloc_cpu(cpu); + } + return rc; +} + +static void cpum_cf_free(int cpu) +{ + if (cpu == -1) { + for_each_online_cpu(cpu) + cpum_cf_free_cpu(cpu); + } else { + cpum_cf_free_cpu(cpu); + } +} + #define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ /* interval in seconds */ @@ -96,11 +390,10 @@ struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ /* Create the trailer data at the end of a page. */ static void cfdiag_trailer(struct cf_trailer_entry *te) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct cpuid cpuid; - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; + te->cfvn = cpumf_ctr_info.cfvn; /* Counter version numbers */ + te->csvn = cpumf_ctr_info.csvn; get_cpu_id(&cpuid); /* Machine type */ te->mach_type = cpuid.machine; @@ -112,6 +405,63 @@ static void cfdiag_trailer(struct cf_trailer_entry *te) te->timestamp = get_tod_clock_fast(); } +/* + * The number of counters per counter set varies between machine generations, + * but is constant when running on a particular machine generation. + * Determine each counter set size at device driver initialization and + * retrieve it later. + */ +static size_t cpumf_ctr_setsizes[CPUMF_CTR_SET_MAX]; +static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (cpumf_ctr_info.cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (cpumf_ctr_info.cfvn == 1) + ctrset_size = 6; + else if (cpumf_ctr_info.cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 16; + else if (cpumf_ctr_info.csvn >= 6) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (cpumf_ctr_info.csvn == 1) + ctrset_size = 32; + else if (cpumf_ctr_info.csvn == 2) + ctrset_size = 48; + else if (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5) + ctrset_size = 128; + else if (cpumf_ctr_info.csvn >= 6 && cpumf_ctr_info.csvn <= 8) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (cpumf_ctr_info.csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + cpumf_ctr_setsizes[ctrset] = ctrset_size; +} + +/* + * Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +static size_t cpum_cf_read_setsize(enum cpumf_ctr_set ctrset) +{ + return cpumf_ctr_setsizes[ctrset]; +} + /* Read a counter set. The counter set number determines the counter set and * the CPUM-CF first and second version number determine the number of * available counters in each counter set. @@ -130,14 +480,13 @@ static void cfdiag_trailer(struct cf_trailer_entry *te) static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, size_t room, bool error_ok) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); size_t ctrset_size, need = 0; int rc = 3; /* Assume write failure */ ctrdata->def = CF_DIAG_CTRSET_DEF; ctrdata->set = ctrset; ctrdata->res1 = 0; - ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); + ctrset_size = cpum_cf_read_setsize(ctrset); if (ctrset_size) { /* Save data */ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); @@ -151,13 +500,17 @@ static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, need = 0; } - debug_sprintf_event(cf_dbg, 3, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", __func__, ctrset, ctrset_size, - cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); return need; } +static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, + [CPUMF_CTR_SET_MT_DIAG] = 0x20, +}; + /* Read out all counter sets and save them in the provided data buffer. * The last 64 byte host an artificial trailer entry. */ @@ -205,25 +558,31 @@ static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) struct cf_trailer_entry *trailer_start, *trailer_stop; struct cf_ctrset_entry *ctrstart, *ctrstop; size_t offset = 0; + int i; - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + /* Counter set not authorized */ + if (!(auth & cpumf_ctr_ctl[i])) + continue; + /* Counter set size zero was not saved */ + if (!cpum_cf_read_setsize(i)) + continue; + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { pr_err_once("cpum_cf_diag counter set compare error " "in set %i\n", ctrstart->set); return 0; } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; if (ctrstart->def == CF_DIAG_CTRSET_DEF) { cfdiag_diffctrset((u64 *)(ctrstart + 1), (u64 *)(ctrstop + 1), ctrstart->ctr); offset += ctrstart->ctr * sizeof(u64) + sizeof(*ctrstart); } - } while (ctrstart->def && auth); + } /* Save time_stamp from start of event in stop's trailer */ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); @@ -251,40 +610,35 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc, - enum cpumf_ctr_set set) +static int validate_ctr_version(const u64 config, enum cpumf_ctr_set set) { - struct cpu_cf_events *cpuhw; - int err = 0; u16 mtdiag_ctl; - - cpuhw = &get_cpu_var(cpu_cf_events); + int err = 0; /* check required version for counter sets */ switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: - if (cpuhw->info.cfvn < 1) + if (cpumf_ctr_info.cfvn < 1) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: - if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && - hwc->config > 79) || - (cpuhw->info.csvn >= 6 && hwc->config > 83)) + if ((cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5 && + config > 79) || (cpumf_ctr_info.csvn >= 6 && config > 83)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_EXT: - if (cpuhw->info.csvn < 1) + if (cpumf_ctr_info.csvn < 1) err = -EOPNOTSUPP; - if ((cpuhw->info.csvn == 1 && hwc->config > 159) || - (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 - && hwc->config > 255) || - (cpuhw->info.csvn >= 6 && hwc->config > 287)) + if ((cpumf_ctr_info.csvn == 1 && config > 159) || + (cpumf_ctr_info.csvn == 2 && config > 175) || + (cpumf_ctr_info.csvn >= 3 && cpumf_ctr_info.csvn <= 5 && + config > 255) || + (cpumf_ctr_info.csvn >= 6 && config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: - if (cpuhw->info.csvn <= 3) + if (cpumf_ctr_info.csvn <= 3) err = -EOPNOTSUPP; /* * MT-diagnostic counters are read-only. The counter set @@ -299,35 +653,15 @@ static int validate_ctr_version(const struct hw_perf_event *hwc, * counter set is enabled and active. */ mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; - if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && - (cpuhw->info.enable_ctl & mtdiag_ctl) && - (cpuhw->info.act_ctl & mtdiag_ctl))) + if (!((cpumf_ctr_info.auth_ctl & mtdiag_ctl) && + (cpumf_ctr_info.enable_ctl & mtdiag_ctl) && + (cpumf_ctr_info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MAX: err = -EOPNOTSUPP; } - put_cpu_var(cpu_cf_events); - return err; -} - -static int validate_ctr_auth(const struct hw_perf_event *hwc) -{ - struct cpu_cf_events *cpuhw; - int err = 0; - - cpuhw = &get_cpu_var(cpu_cf_events); - - /* Check authorization for cpu counter sets. - * If the particular CPU counter set is not authorized, - * return with -ENOENT in order to fall back to other - * PMUs that might suffice the event request. - */ - if (!(hwc->config_base & cpuhw->info.auth_ctl)) - err = -ENOENT; - - put_cpu_var(cpu_cf_events); return err; } @@ -338,20 +672,17 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) */ static void cpumf_pmu_enable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int err; - if (cpuhw->flags & PMU_F_ENABLED) + if (!cpuhw || (cpuhw->flags & PMU_F_ENABLED)) return; err = lcctl(cpuhw->state | cpuhw->dev_state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags |= PMU_F_ENABLED; + if (err) + pr_err("Enabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags |= PMU_F_ENABLED; } /* @@ -361,40 +692,26 @@ static void cpumf_pmu_enable(struct pmu *pmu) */ static void cpumf_pmu_disable(struct pmu *pmu) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); u64 inactive; + int err; - if (!(cpuhw->flags & PMU_F_ENABLED)) + if (!cpuhw || !(cpuhw->flags & PMU_F_ENABLED)) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); inactive |= cpuhw->dev_state; err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - - cpuhw->flags &= ~PMU_F_ENABLED; + if (err) + pr_err("Disabling the performance measuring unit failed with rc=%x\n", err); + else + cpuhw->flags &= ~PMU_F_ENABLED; } - -/* Number of perf events counting hardware events */ -static atomic_t num_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - /* Release the PMU if event is the last perf event */ static void hw_perf_event_destroy(struct perf_event *event) { - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - __kernel_cpumcf_end(); - mutex_unlock(&pmc_reserve_mutex); - } + cpum_cf_free(event->cpu); } /* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ @@ -418,12 +735,10 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; -static void cpumf_hw_inuse(void) +static int is_userspace_event(u64 ev) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_inc_return(&num_events) == 1) - __kernel_cpumcf_begin(); - mutex_unlock(&pmc_reserve_mutex); + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; } static int __hw_perf_event_init(struct perf_event *event, unsigned int type) @@ -431,7 +746,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; enum cpumf_ctr_set set; - int err = 0; u64 ev; switch (type) { @@ -445,22 +759,27 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) break; case PERF_TYPE_HARDWARE: - if (is_sampling_event(event)) /* No sampling support */ - return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if (!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - } else { /* Count user and kernel space */ - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + } else { + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; @@ -497,32 +816,53 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) } /* Initialize for using the CPU-measurement counter facility */ - cpumf_hw_inuse(); + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; - /* Finally, validate version and authorization of the counter set */ - err = validate_ctr_auth(hwc); - if (!err) - err = validate_ctr_version(hwc, set); + /* + * Finally, validate version and authorization of the counter set. + * If the particular CPU counter set is not authorized, + * return with -ENOENT in order to fall back to other + * PMUs that might suffice the event request. + */ + if (!(hwc->config_base & cpumf_ctr_info.auth_ctl)) + return -ENOENT; + return validate_ctr_version(hwc->config, set); +} - return err; +/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different + * attribute::type values: + * - PERF_TYPE_HARDWARE: + * - pmu->type: + * Handle both type of invocations identical. They address the same hardware. + * The result is different when event modifiers exclude_kernel and/or + * exclude_user are also set. + */ +static int cpumf_pmu_event_type(struct perf_event *event) +{ + u64 ev = event->attr.config; + + if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) + return PERF_TYPE_HARDWARE; + return PERF_TYPE_RAW; } static int cpumf_pmu_event_init(struct perf_event *event) { unsigned int type = event->attr.type; - int err; + int err = -ENOENT; + if (is_sampling_event(event)) /* No sampling support */ + return err; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) /* Registered as unknown PMU */ - err = __hw_perf_event_init(event, PERF_TYPE_RAW); - else - return -ENOENT; - - if (unlikely(err) && event->destroy) - event->destroy(event); + err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); return err; } @@ -532,8 +872,8 @@ static int hw_perf_event_reset(struct perf_event *event) u64 prev, new; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) { if (err != 3) @@ -545,7 +885,7 @@ static int hw_perf_event_reset(struct perf_event *event) */ new = 0; } - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); return err; } @@ -555,12 +895,12 @@ static void hw_perf_event_update(struct perf_event *event) u64 prev, new, delta; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) return; - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; /* overflow */ @@ -577,7 +917,7 @@ static void cpumf_pmu_read(struct perf_event *event) static void cpumf_pmu_start(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; int i; @@ -634,17 +974,10 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - raw.size = raw.frag.size; - data.raw = &raw; + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_dbg, 3, - "%s event %#llx sample_type %#llx raw %d ov %d\n", - __func__, event->hw.config, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); perf_event_update_userpage(event); return overflow; @@ -652,7 +985,7 @@ static int cfdiag_push_sample(struct perf_event *event, static void cpumf_pmu_stop(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct hw_perf_event *hwc = &event->hw; int i; @@ -679,15 +1012,16 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) false); if (cfdiag_diffctr(cpuhw, event->hw.config_base)) cfdiag_push_sample(event, cpuhw); - } else + } else { hw_perf_event_update(event); + } hwc->state |= PERF_HES_UPTODATE; } } static int cpumf_pmu_add(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); ctr_set_enable(&cpuhw->state, event->hw.config_base); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -700,7 +1034,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) static void cpumf_pmu_del(struct perf_event *event, int flags) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -711,7 +1045,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * * When a new perf event has been added but not yet started, this can * clear enable control and resets all counters in a set. Therefore, - * cpumf_pmu_start() always has to reenable a counter set. + * cpumf_pmu_start() always has to re-enable a counter set. */ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) if (!atomic_read(&cpuhw->ctr_set[i])) @@ -732,31 +1066,172 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static struct cfset_session { /* CPUs and counter set bit mask */ + struct list_head head; /* Head of list of active processes */ +} cfset_session = { + .head = LIST_HEAD_INIT(cfset_session.head) +}; + +static refcount_t cfset_opencnt = REFCOUNT_INIT(0); /* Access count */ +/* + * Synchronize access to device /dev/hwc. This mutex protects against + * concurrent access to functions cfset_open() and cfset_release(). + * Same for CPU hotplug add and remove events triggering + * cpum_cf_online_cpu() and cpum_cf_offline_cpu(). + * It also serializes concurrent device ioctl access from multiple + * processes accessing /dev/hwc. + * + * The mutex protects concurrent access to the /dev/hwctr session management + * struct cfset_session and reference counting variable cfset_opencnt. + */ +static DEFINE_MUTEX(cfset_ctrset_mutex); + +/* + * CPU hotplug handles only /dev/hwctr device. + * For perf_event_open() the CPU hotplug handling is done on kernel common + * code: + * - CPU add: Nothing is done since a file descriptor can not be created + * and returned to the user. + * - CPU delete: Handled by common code via pmu_disable(), pmu_stop() and + * pmu_delete(). The event itself is removed when the file descriptor is + * closed. + */ +static int cfset_online_cpu(unsigned int cpu); + +static int cpum_cf_online_cpu(unsigned int cpu) +{ + int rc = 0; + + /* + * Ignore notification for perf_event_open(). + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + rc = cpum_cf_alloc_cpu(cpu); + if (!rc) + cfset_online_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return rc; +} + +static int cfset_offline_cpu(unsigned int cpu); + +static int cpum_cf_offline_cpu(unsigned int cpu) +{ + /* + * During task exit processing of grouped perf events triggered by CPU + * hotplug processing, pmu_disable() is called as part of perf context + * removal process. Therefore do not trigger event removal now for + * perf_event_open() created events. Perf common code triggers event + * destruction when the event file descriptor is closed. + * + * Handle only /dev/hwctr device sessions. + */ + mutex_lock(&cfset_ctrset_mutex); + if (refcount_read(&cfset_opencnt)) { + cfset_offline_cpu(cpu); + cpum_cf_free_cpu(cpu); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +/* Return true if store counter set multiple instruction is available */ +static inline int stccm_avail(void) +{ + return test_facility(142); +} + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(struct ext_code ext_code, + unsigned int alert, unsigned long unused) +{ + struct cpu_cf_events *cpuhw; + + if (!(alert & CPU_MF_INT_CF_MASK)) + return; + + inc_irq_stat(IRQEXT_CMC); + + /* + * Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. + */ + cpuhw = this_cpu_cfhw(); + if (!cpuhw) + return; + + /* counter authorization change alert */ + if (alert & CPU_MF_INT_CF_CACA) + qctri(&cpumf_ctr_info); + + /* loss of counter data alert */ + if (alert & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); + + /* loss of MT counter data alert */ + if (alert & CPU_MF_INT_CF_MTDA) + pr_warn("CPU[%i] MT counter data was lost\n", + smp_processor_id()); +} + static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; - if (!kernel_cpumcf_avail()) + /* Extract counter measurement facility information */ + if (!cpum_cf_avail() || qctri(&cpumf_ctr_info)) return -ENODEV; + /* Determine and store counter set sizes for later reference */ + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + cpum_cf_make_setsize(rc); + + /* + * Clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters + */ + system_ctl_clear_bit(0, CR0_CPUMF_EXTRACTION_AUTH_BIT); + + /* register handler for measurement-alert interruptions */ + rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, + cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts failed with rc=%i\n", rc); + return rc; + } + /* Setup s390dbf facility */ - cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + cf_dbg = debug_register("cpum_cf", 2, 1, 128); if (!cf_dbg) { pr_err("Registration of s390dbf(cpum_cf) failed\n"); - return -ENOMEM; - }; + rc = -ENOMEM; + goto out1; + } debug_register_view(cf_dbg, &debug_sprintf_view); cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); if (rc) { - debug_unregister_view(cf_dbg, &debug_sprintf_view); - debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + goto out2; } else if (stccm_avail()) { /* Setup counter set device */ cfset_init(); } + + rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, + "perf/s390/cf:online", + cpum_cf_online_cpu, cpum_cf_offline_cpu); + return rc; + +out2: + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); +out1: + unregister_external_irq(EXT_IRQ_MEASURE_ALERT, cpumf_measurement_alert); return rc; } @@ -765,22 +1240,38 @@ static int __init cpumf_pmu_init(void) * counter set via normal file operations. */ -static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */ -static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ unsigned int sets; /* Counter set bit mask */ atomic_t cpus_ack; /* # CPUs successfully executed func */ }; -static struct cfset_request { /* CPUs and counter set bit mask */ +struct cfset_request { /* CPUs and counter set bit mask */ unsigned long ctrset; /* Bit mask of counter set to read */ cpumask_t mask; /* CPU mask to read from */ -} cfset_request; + struct list_head node; /* Chain to cfset_session.head */ +}; + +static void cfset_session_init(void) +{ + INIT_LIST_HEAD(&cfset_session.head); +} + +/* Remove current request from global bookkeeping. Maintain a counter set bit + * mask on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_del(struct cfset_request *p) +{ + list_del(&p->node); +} -static void cfset_ctrset_clear(void) +/* Add current request to global bookkeeping. Maintain a counter set bit mask + * on a per CPU basis. + * Done in process context under mutex protection. + */ +static void cfset_session_add(struct cfset_request *p) { - cpumask_clear(&cfset_request.mask); - cfset_request.ctrset = 0; + list_add(&p->node, &cfset_session.head); } /* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access @@ -815,27 +1306,33 @@ static void cfset_ctrset_clear(void) /* Stop all counter sets via ioctl interface */ static void cfset_ioctl_off(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int rc; - cpuhw->dev_state = 0; + /* Check if any counter set used by /dev/hwctr */ for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) - if ((p->sets & cpumf_ctr_ctl[rc])) - atomic_dec(&cpuhw->ctr_set[rc]); - rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if ((p->sets & cpumf_ctr_ctl[rc])) { + if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { + ctr_set_disable(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + ctr_set_stop(&cpuhw->dev_state, + cpumf_ctr_ctl[rc]); + } + } + /* Keep perf_event_open counter sets */ + rc = lcctl(cpuhw->dev_state | cpuhw->state); if (rc) pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->flags &= ~PMU_F_IN_USE; - debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", - __func__, rc, cpuhw->state, cpuhw->dev_state); + if (!cpuhw->dev_state) + cpuhw->flags &= ~PMU_F_IN_USE; } /* Start counter sets on particular CPU */ static void cfset_ioctl_on(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int rc; @@ -851,22 +1348,33 @@ static void cfset_ioctl_on(void *parm) else pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); - debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", - __func__, rc, cpuhw->state, cpuhw->dev_state); } static void cfset_release_cpu(void *p) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); int rc; - debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", - __func__, cpuhw->state, cpuhw->dev_state); + cpuhw->dev_state = 0; rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ if (rc) pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", cpuhw->state, S390_HWCTR_DEVICE, rc); - cpuhw->dev_state = 0; +} + +/* This modifies the process CPU mask to adopt it to the currently online + * CPUs. Offline CPUs can not be addresses. This call terminates the access + * and is usually followed by close() or a new iotcl(..., START, ...) which + * creates a new request structure. + */ +static void cfset_all_stop(struct cfset_request *req) +{ + struct cfset_call_on_cpu_parm p = { + .sets = req->ctrset, + }; + + cpumask_and(&req->mask, &req->mask, cpu_online_mask); + on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); } /* Release function is also called when application gets terminated without @@ -874,46 +1382,55 @@ static void cfset_release_cpu(void *p) */ static int cfset_release(struct inode *inode, struct file *file) { - on_each_cpu(cfset_release_cpu, NULL, 1); - hw_perf_event_destroy(NULL); - cfset_ctrset_clear(); - atomic_set(&cfset_opencnt, 0); + mutex_lock(&cfset_ctrset_mutex); + /* Open followed by close/exit has no private_data */ + if (file->private_data) { + cfset_all_stop(file->private_data); + cfset_session_del(file->private_data); + kfree(file->private_data); + file->private_data = NULL; + } + if (refcount_dec_and_test(&cfset_opencnt)) { /* Last close */ + on_each_cpu(cfset_release_cpu, NULL, 1); + cpum_cf_free(-1); + } + mutex_unlock(&cfset_ctrset_mutex); return 0; } +/* + * Open via /dev/hwctr device. Allocate all per CPU resources on the first + * open of the device. The last close releases all per CPU resources. + * Parallel perf_event_open system calls also use per CPU resources. + * These invocations are handled via reference counting on the per CPU data + * structures. + */ static int cfset_open(struct inode *inode, struct file *file) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* Only one user space program can open /dev/hwctr */ - if (atomic_xchg(&cfset_opencnt, 1)) - return -EBUSY; + int rc = 0; - cpumf_hw_inuse(); + if (!perfmon_capable()) + return -EPERM; file->private_data = NULL; - /* nonseekable_open() never fails */ - return nonseekable_open(inode, file); -} -static int cfset_all_stop(void) -{ - struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, - }; - cpumask_var_t mask; + mutex_lock(&cfset_ctrset_mutex); + if (!refcount_inc_not_zero(&cfset_opencnt)) { /* First open */ + rc = cpum_cf_alloc(-1); + if (!rc) { + cfset_session_init(); + refcount_set(&cfset_opencnt, 1); + } + } + mutex_unlock(&cfset_ctrset_mutex); - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); - on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; + /* nonseekable_open() never fails */ + return rc ?: nonseekable_open(inode, file); } -static int cfset_all_start(void) +static int cfset_all_start(struct cfset_request *req) { struct cfset_call_on_cpu_parm p = { - .sets = cfset_request.ctrset, + .sets = req->ctrset, .cpus_ack = ATOMIC_INIT(0), }; cpumask_var_t mask; @@ -921,18 +1438,16 @@ static int cfset_all_start(void) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); rc = -EIO; - debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); } free_cpumask_var(mask); return rc; } - /* Return the maximum required space for all possible CPUs in case one * CPU will be onlined during the START, READ, STOP cycles. * To find out the size of the counter sets, any one CPU will do. They @@ -940,34 +1455,32 @@ static int cfset_all_start(void) */ static size_t cfset_needspace(unsigned int sets) { - struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); size_t bytes = 0; int i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { if (!(sets & cpumf_ctr_ctl[i])) continue; - bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + bytes += cpum_cf_read_setsize(i) * sizeof(u64) + sizeof(((struct s390_ctrset_setdata *)0)->set) + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); } bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); - put_cpu_ptr(&cpu_cf_events); return bytes; } static int cfset_all_copy(unsigned long arg, cpumask_t *mask) { struct s390_ctrset_read __user *ctrset_read; - unsigned int cpu, cpus, rc; + unsigned int cpu, cpus, rc = 0; void __user *uptr; ctrset_read = (struct s390_ctrset_read __user *)arg; uptr = ctrset_read->data; for_each_cpu(cpu, mask) { - struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); + struct cpu_cf_events *cpuhw = get_cpu_cfhw(cpu); struct s390_ctrset_cpudata __user *ctrset_cpudata; ctrset_cpudata = uptr; @@ -975,17 +1488,18 @@ static int cfset_all_copy(unsigned long arg, cpumask_t *mask) rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, cpuhw->used); - if (rc) - return -EFAULT; + if (rc) { + rc = -EFAULT; + goto out; + } uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; cond_resched(); } cpus = cpumask_weight(mask); if (put_user(cpus, &ctrset_read->no_cpus)) - return -EFAULT; - debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, - uptr - (void __user *)ctrset_read->data); - return 0; + rc = -EFAULT; +out: + return rc; } static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, @@ -1008,7 +1522,7 @@ static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, /* Read all counter sets. */ static void cfset_cpu_read(void *parm) { - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpu_cf_events *cpuhw = this_cpu_cfhw(); struct cfset_call_on_cpu_parm *p = parm; int set, set_size; size_t space; @@ -1025,7 +1539,7 @@ static void cfset_cpu_read(void *parm) if (!(p->sets & cpumf_ctr_ctl[set])) continue; /* Counter set not in list */ - set_size = cpum_cf_ctrset_size(set, &cpuhw->info); + set_size = cpum_cf_read_setsize(set); space = sizeof(cpuhw->data) - cpuhw->used; space = cfset_cpuset_read(sp, set, set_size, space); if (space) { @@ -1033,11 +1547,9 @@ static void cfset_cpu_read(void *parm) cpuhw->sets += 1; } } - debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, - cpuhw->sets, cpuhw->used); } -static int cfset_all_read(unsigned long arg) +static int cfset_all_read(unsigned long arg, struct cfset_request *req) { struct cfset_call_on_cpu_parm p; cpumask_var_t mask; @@ -1046,46 +1558,49 @@ static int cfset_all_read(unsigned long arg) if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - p.sets = cfset_request.ctrset; - cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + p.sets = req->ctrset; + cpumask_and(mask, &req->mask, cpu_online_mask); on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); rc = cfset_all_copy(arg, mask); free_cpumask_var(mask); return rc; } -static long cfset_ioctl_read(unsigned long arg) +static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) { - struct s390_ctrset_read read; - int ret = 0; + int ret = -ENODATA; - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cfset_all_read(arg); + if (req && req->ctrset) + ret = cfset_all_read(arg, req); return ret; } -static long cfset_ioctl_stop(void) +static long cfset_ioctl_stop(struct file *file) { - int ret = ENXIO; - - if (cfset_request.ctrset) { - ret = cfset_all_stop(); - cfset_ctrset_clear(); + struct cfset_request *req = file->private_data; + int ret = -ENXIO; + + if (req) { + cfset_all_stop(req); + cfset_session_del(req); + kfree(req); + file->private_data = NULL; + ret = 0; } return ret; } -static long cfset_ioctl_start(unsigned long arg) +static long cfset_ioctl_start(unsigned long arg, struct file *file) { struct s390_ctrset_start __user *ustart; struct s390_ctrset_start start; + struct cfset_request *preq; void __user *umask; unsigned int len; int ret = 0; size_t need; - if (cfset_request.ctrset) + if (file->private_data) return -EBUSY; ustart = (struct s390_ctrset_start __user *)arg; if (copy_from_user(&start, ustart, sizeof(start))) @@ -1100,25 +1615,34 @@ static long cfset_ioctl_start(unsigned long arg) return -EINVAL; /* Invalid counter set */ if (!start.counter_sets) return -EINVAL; /* No counter set at all? */ - cpumask_clear(&cfset_request.mask); + + preq = kzalloc(sizeof(*preq), GFP_KERNEL); + if (!preq) + return -ENOMEM; + cpumask_clear(&preq->mask); len = min_t(u64, start.cpumask_len, cpumask_size()); umask = (void __user *)start.cpumask; - if (copy_from_user(&cfset_request.mask, umask, len)) + if (copy_from_user(&preq->mask, umask, len)) { + kfree(preq); return -EFAULT; - if (cpumask_empty(&cfset_request.mask)) + } + if (cpumask_empty(&preq->mask)) { + kfree(preq); return -EINVAL; + } need = cfset_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cfset_request.ctrset = start.counter_sets; - ret = cfset_all_start(); -out: - if (ret) - cfset_ctrset_clear(); - debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", - __func__, cfset_request.ctrset, need, ret); + if (put_user(need, &ustart->data_bytes)) { + kfree(preq); + return -EFAULT; + } + preq->ctrset = start.counter_sets; + ret = cfset_all_start(preq); + if (!ret) { + cfset_session_add(preq); + file->private_data = preq; + } else { + kfree(preq); + } return ret; } @@ -1128,7 +1652,7 @@ out: * counter set keeps running until explicitly stopped. Returns the number * of bytes needed to store the counter values. If another S390_HWCTR_START * ioctl subcommand is called without a previous S390_HWCTR_STOP stop - * command, -EBUSY is returned. + * command on the same file descriptor, -EBUSY is returned. * S390_HWCTR_READ: Read the counter set values from specified CPU list given * with the S390_HWCTR_START command. * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the @@ -1138,24 +1662,24 @@ static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfset_ctrset_mutex); switch (cmd) { case S390_HWCTR_START: - ret = cfset_ioctl_start(arg); + ret = cfset_ioctl_start(arg, file); break; case S390_HWCTR_STOP: - ret = cfset_ioctl_stop(); + ret = cfset_ioctl_stop(file); break; case S390_HWCTR_READ: - ret = cfset_ioctl_read(arg); + ret = cfset_ioctl_read(arg, file->private_data); break; default: ret = -ENOTTY; break; } mutex_unlock(&cfset_ctrset_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -1164,62 +1688,65 @@ static const struct file_operations cfset_fops = { .open = cfset_open, .release = cfset_release, .unlocked_ioctl = cfset_ioctl, - .compat_ioctl = cfset_ioctl, - .llseek = no_llseek }; static struct miscdevice cfset_dev = { .name = S390_HWCTR_DEVICE, .minor = MISC_DYNAMIC_MINOR, .fops = &cfset_fops, + .mode = 0666, }; -int cfset_online_cpu(unsigned int cpu) +/* Hotplug add of a CPU. Scan through all active processes and add + * that CPU to the list of CPUs supplied with ioctl(..., START, ...). + */ +static int cfset_online_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; - mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_on(&p); - cpumask_set_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &rp->mask); + } } - mutex_unlock(&cfset_ctrset_mutex); return 0; } -int cfset_offline_cpu(unsigned int cpu) +/* Hotplug remove of a CPU. Scan through all active processes and clear + * that CPU from the list of CPUs supplied with ioctl(..., START, ...). + * Adjust reference counts. + */ +static int cfset_offline_cpu(unsigned int cpu) { struct cfset_call_on_cpu_parm p; + struct cfset_request *rp; - mutex_lock(&cfset_ctrset_mutex); - if (cfset_request.ctrset) { - p.sets = cfset_request.ctrset; - cfset_ioctl_off(&p); - cpumask_clear_cpu(cpu, &cfset_request.mask); + if (!list_empty(&cfset_session.head)) { + list_for_each_entry(rp, &cfset_session.head, node) { + p.sets = rp->ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &rp->mask); + } } - mutex_unlock(&cfset_ctrset_mutex); return 0; } static void cfdiag_read(struct perf_event *event) { - debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, - event->attr.config, local64_read(&event->count)); } static int get_authctrsets(void) { - struct cpu_cf_events *cpuhw; unsigned long auth = 0; enum cpumf_ctr_set i; - cpuhw = &get_cpu_var(cpu_cf_events); for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + if (cpumf_ctr_info.auth_ctl & cpumf_ctr_ctl[i]) auth |= cpumf_ctr_ctl[i]; } - put_cpu_var(cpu_cf_events); return auth; } @@ -1253,8 +1780,6 @@ static int cfdiag_event_init2(struct perf_event *event) if (!event->hw.config_base) err = -EINVAL; - debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); return err; } @@ -1279,12 +1804,11 @@ static int cfdiag_event_init(struct perf_event *event) } /* Initialize for using the CPU-measurement counter facility */ - cpumf_hw_inuse(); + if (cpum_cf_alloc(event->cpu)) + return -ENOMEM; event->destroy = hw_perf_event_destroy; err = cfdiag_event_init2(event); - if (unlikely(err)) - event->destroy(event); out: return err; } @@ -1327,7 +1851,7 @@ static const struct attribute_group *cfdiag_attr_groups[] = { /* Performance monitoring unit for event CF_DIAG. Since this event * is also started and stopped via the perf_event_open() system call, use * the same event enable/disable call back functions. They do not - * have a pointer to the perf_event strcture as first parameter. + * have a pointer to the perf_event structure as first parameter. * * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. * Reuse them and distinguish the event (always first parameter) via @@ -1357,7 +1881,7 @@ static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) enum cpumf_ctr_set i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cpum_cf_ctrset_size(i, info); + size_t size = cpum_cf_read_setsize(i); if (size) max_size += size * sizeof(u64) + @@ -1369,6 +1893,8 @@ static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) /* Get the CPU speed, try sampling facility first and CPU attributes second. */ static void cfdiag_get_cpu_speed(void) { + unsigned long mhz; + if (cpum_sf_avail()) { /* Sampling facility first */ struct hws_qsi_info_block si; @@ -1382,26 +1908,19 @@ static void cfdiag_get_cpu_speed(void) /* Fallback: CPU speed extract static part. Used in case * CPU Measurement Sampling Facility is turned off. */ - if (test_facility(34)) { - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cfdiag_cpu_speed = mhz & 0xffffffff; - } + mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; } static int cfset_init(void) { - struct cpumf_ctr_info info; size_t need; int rc; - if (qctri(&info)) - return -ENODEV; - cfdiag_get_cpu_speed(); /* Make sure the counter set data fits into predefined buffer. */ - need = cfdiag_maxsize(&info); + need = cfdiag_maxsize(&cpumf_ctr_info); if (need > sizeof(((struct cpu_cf_events *)0)->start)) { pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", need); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c deleted file mode 100644 index 30f0242de4a5..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * CPU-Measurement Counter Facility Support - Common Layer - * - * Copyright IBM Corp. 2019 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_common" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> - -/* Per-CPU event structure for the counter facility */ -DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { - .ctr_set = { - [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), - [CPUMF_CTR_SET_MT_DIAG] = ATOMIC_INIT(0), - }, - .alert = ATOMIC64_INIT(0), - .state = 0, - .dev_state = 0, - .flags = 0, - .used = 0, - .usedss = 0, - .sets = 0 -}; -/* Indicator whether the CPU-Measurement Counter Facility Support is ready */ -static bool cpum_cf_initalized; - -/* CPU-measurement alerts for the counter facility */ -static void cpumf_measurement_alert(struct ext_code ext_code, - unsigned int alert, unsigned long unused) -{ - struct cpu_cf_events *cpuhw; - - if (!(alert & CPU_MF_INT_CF_MASK)) - return; - - inc_irq_stat(IRQEXT_CMC); - cpuhw = this_cpu_ptr(&cpu_cf_events); - - /* Measurement alerts are shared and might happen when the PMU - * is not reserved. Ignore these alerts in this case. */ - if (!(cpuhw->flags & PMU_F_RESERVED)) - return; - - /* counter authorization change alert */ - if (alert & CPU_MF_INT_CF_CACA) - qctri(&cpuhw->info); - - /* loss of counter data alert */ - if (alert & CPU_MF_INT_CF_LCDA) - pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); - - /* loss of MT counter data alert */ - if (alert & CPU_MF_INT_CF_MTDA) - pr_warn("CPU[%i] MT counter data was lost\n", - smp_processor_id()); - - /* store alert for special handling by in-kernel users */ - atomic64_or(alert, &cpuhw->alert); -} - -#define PMC_INIT 0 -#define PMC_RELEASE 1 -static void cpum_cf_setup_cpu(void *flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - switch (*((int *) flags)) { - case PMC_INIT: - memset(&cpuhw->info, 0, sizeof(cpuhw->info)); - qctri(&cpuhw->info); - cpuhw->flags |= PMU_F_RESERVED; - break; - - case PMC_RELEASE: - cpuhw->flags &= ~PMU_F_RESERVED; - break; - } - - /* Disable CPU counter sets */ - lcctl(0); -} - -bool kernel_cpumcf_avail(void) -{ - return cpum_cf_initalized; -} -EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Initialize the CPU-measurement counter facility */ -int __kernel_cpumcf_begin(void) -{ - int flags = PMC_INIT; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; -} -EXPORT_SYMBOL(__kernel_cpumcf_begin); - -/* Obtain the CPU-measurement alerts for the counter facility */ -unsigned long kernel_cpumcf_alert(int clear) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - unsigned long alert; - - alert = atomic64_read(&cpuhw->alert); - if (clear) - atomic64_set(&cpuhw->alert, 0); - - return alert; -} -EXPORT_SYMBOL(kernel_cpumcf_alert); - -/* Release the CPU-measurement counter facility */ -void __kernel_cpumcf_end(void) -{ - int flags = PMC_RELEASE; - - on_each_cpu(cpum_cf_setup_cpu, &flags, 1); - irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); -} -EXPORT_SYMBOL(__kernel_cpumcf_end); - -static int cpum_cf_setup(unsigned int cpu, int flags) -{ - local_irq_disable(); - cpum_cf_setup_cpu(&flags); - local_irq_enable(); - return 0; -} - -static int cpum_cf_online_cpu(unsigned int cpu) -{ - cpum_cf_setup(cpu, PMC_INIT); - return cfset_online_cpu(cpu); -} - -static int cpum_cf_offline_cpu(unsigned int cpu) -{ - cfset_offline_cpu(cpu); - return cpum_cf_setup(cpu, PMC_RELEASE); -} - -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - -static int __init cpum_cf_init(void) -{ - int rc; - - if (!cpum_cf_avail()) - return -ENODEV; - - /* clear bit 15 of cr0 to unauthorize problem-state to - * extract measurement counters */ - ctl_clear_bit(0, 48); - - /* register handler for measurement-alert interruptions */ - rc = register_external_irq(EXT_IRQ_MEASURE_ALERT, - cpumf_measurement_alert); - if (rc) { - pr_err("Registering for CPU-measurement alerts " - "failed with rc=%i\n", rc); - return rc; - } - - rc = cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE, - "perf/s390/cf:online", - cpum_cf_online_cpu, cpum_cf_offline_cpu); - if (!rc) - cpum_cf_initalized = true; - - return rc; -} -early_initcall(cpum_cf_init); diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 37265f551a11..7ace1f9e4ccf 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -237,7 +237,6 @@ CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z14, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); - CPUMF_EVENT_ATTR(cf_z15, L1D_RO_EXCL_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z15, DTLB2_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z15, DTLB2_MISSES, 0x0082); @@ -291,10 +290,157 @@ CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_NO_SPECIAL, 0x00f4); CPUMF_EVENT_ATTR(cf_z15, TX_C_TABORT_SPECIAL, 0x00f5); CPUMF_EVENT_ATTR(cf_z15, DFLT_ACCESS, 0x00f7); CPUMF_EVENT_ATTR(cf_z15, DFLT_CYCLES, 0x00fc); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x00108); -CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x00109); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CC, 0x0108); +CPUMF_EVENT_ATTR(cf_z15, DFLT_CCFINISH, 0x0109); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); CPUMF_EVENT_ATTR(cf_z15, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z16, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z16, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z16, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z16, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z16, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z16, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z16, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z16, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z16, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z16, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z16, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z16, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_CHIP_MEMORY, 0x00b4); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_MODULE_MEMORY, 0x00b5); +CPUMF_EVENT_ATTR(cf_z16, ICW_ON_DRAWER_MEMORY, 0x00b6); +CPUMF_EVENT_ATTR(cf_z16, ICW_OFF_DRAWER_MEMORY, 0x00b7); +CPUMF_EVENT_ATTR(cf_z16, BCD_DFP_EXECUTION_SLOTS, 0x00e0); +CPUMF_EVENT_ATTR(cf_z16, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z16, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z16, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z16, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z16, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z16, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z16, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z16, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z16, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z16, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z16, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z16, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); +CPUMF_EVENT_ATTR(cf_z17, L1D_RO_EXCL_WRITES, 0x0080); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_WRITES, 0x0081); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_MISSES, 0x0082); +CPUMF_EVENT_ATTR(cf_z17, CRSTE_1MB_WRITES, 0x0083); +CPUMF_EVENT_ATTR(cf_z17, DTLB2_GPAGE_WRITES, 0x0084); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_WRITES, 0x0086); +CPUMF_EVENT_ATTR(cf_z17, ITLB2_MISSES, 0x0087); +CPUMF_EVENT_ATTR(cf_z17, TLB2_PTE_WRITES, 0x0089); +CPUMF_EVENT_ATTR(cf_z17, TLB2_CRSTE_WRITES, 0x008a); +CPUMF_EVENT_ATTR(cf_z17, TLB2_ENGINES_BUSY, 0x008b); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TEND, 0x008c); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TEND, 0x008d); +CPUMF_EVENT_ATTR(cf_z17, L1C_TLB2_MISSES, 0x008f); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ, 0x0091); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_IV, 0x0092); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_CHIP_HIT, 0x0093); +CPUMF_EVENT_ATTR(cf_z17, DCW_REQ_DRAWER_HIT, 0x0094); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP, 0x0095); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_IV, 0x0096); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_CHIP_HIT, 0x0097); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT, 0x0098); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE, 0x0099); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER, 0x009a); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER, 0x009b); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_CHIP_MEMORY, 0x009c); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_MODULE_MEMORY, 0x009d); +CPUMF_EVENT_ATTR(cf_z17, DCW_ON_DRAWER_MEMORY, 0x009e); +CPUMF_EVENT_ATTR(cf_z17, DCW_OFF_DRAWER_MEMORY, 0x009f); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_IV, 0x00a0); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT, 0x00a1); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT, 0x00a2); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_IV, 0x00a3); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT, 0x00a4); +CPUMF_EVENT_ATTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT, 0x00a5); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_IV, 0x00a6); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT, 0x00a7); +CPUMF_EVENT_ATTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT, 0x00a8); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ, 0x00a9); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_IV, 0x00aa); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_CHIP_HIT, 0x00ab); +CPUMF_EVENT_ATTR(cf_z17, ICW_REQ_DRAWER_HIT, 0x00ac); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP, 0x00ad); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_IV, 0x00ae); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_CHIP_HIT, 0x00af); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT, 0x00b0); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_MODULE, 0x00b1); +CPUMF_EVENT_ATTR(cf_z17, ICW_ON_DRAWER, 0x00b2); +CPUMF_EVENT_ATTR(cf_z17, ICW_OFF_DRAWER, 0x00b3); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_SAMETHRD, 0x00ca); +CPUMF_EVENT_ATTR(cf_z17, CYCLES_DIFFTHRD, 0x00cb); +CPUMF_EVENT_ATTR(cf_z17, INST_SAMETHRD, 0x00cc); +CPUMF_EVENT_ATTR(cf_z17, INST_DIFFTHRD, 0x00cd); +CPUMF_EVENT_ATTR(cf_z17, WRONG_BRANCH_PREDICTION, 0x00ce); +CPUMF_EVENT_ATTR(cf_z17, VX_BCD_EXECUTION_SLOTS, 0x00e1); +CPUMF_EVENT_ATTR(cf_z17, DECIMAL_INSTRUCTIONS, 0x00e2); +CPUMF_EVENT_ATTR(cf_z17, LAST_HOST_TRANSLATIONS, 0x00e8); +CPUMF_EVENT_ATTR(cf_z17, TX_NC_TABORT, 0x00f4); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_NO_SPECIAL, 0x00f5); +CPUMF_EVENT_ATTR(cf_z17, TX_C_TABORT_SPECIAL, 0x00f6); +CPUMF_EVENT_ATTR(cf_z17, DFLT_ACCESS, 0x00f8); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CYCLES, 0x00fd); +CPUMF_EVENT_ATTR(cf_z17, SORTL, 0x0100); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CC, 0x0109); +CPUMF_EVENT_ATTR(cf_z17, DFLT_CCFINISH, 0x010a); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INVOCATIONS, 0x010b); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPLETIONS, 0x010c); +CPUMF_EVENT_ATTR(cf_z17, NNPA_WAIT_LOCK, 0x010d); +CPUMF_EVENT_ATTR(cf_z17, NNPA_HOLD_LOCK, 0x010e); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_ONCHIP, 0x0110); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_OFFCHIP, 0x0111); +CPUMF_EVENT_ATTR(cf_z17, NNPA_INST_DIFF, 0x0112); +CPUMF_EVENT_ATTR(cf_z17, NNPA_4K_PREFETCH, 0x0114); +CPUMF_EVENT_ATTR(cf_z17, NNPA_COMPL_LOCK, 0x0115); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK, 0x0116); +CPUMF_EVENT_ATTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO, 0x0117); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE, 0x01c0); +CPUMF_EVENT_ATTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE, 0x01c1); static struct attribute *cpumcf_fvn1_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_fvn1, CPU_CYCLES), @@ -344,7 +490,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_678_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -635,6 +781,161 @@ static struct attribute *cpumcf_z15_pmu_event_attr[] __initdata = { NULL, }; +static struct attribute *cpumcf_z16_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z16, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z16, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z16, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z16, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z16, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z16, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, ICW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z16, BCD_DFP_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z16, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z16, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z16, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z16, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z16, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z16, SORTL), + CPUMF_EVENT_PTR(cf_z16, DFLT_CC), + CPUMF_EVENT_PTR(cf_z16, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z16, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z16, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z16, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z16, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + +static struct attribute *cpumcf_z17_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_z17, L1D_RO_EXCL_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, CRSTE_1MB_WRITES), + CPUMF_EVENT_PTR(cf_z17, DTLB2_GPAGE_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_WRITES), + CPUMF_EVENT_PTR(cf_z17, ITLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, TLB2_PTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_CRSTE_WRITES), + CPUMF_EVENT_PTR(cf_z17, TLB2_ENGINES_BUSY), + CPUMF_EVENT_PTR(cf_z17, TX_C_TEND), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TEND), + CPUMF_EVENT_PTR(cf_z17, L1C_TLB2_MISSES), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_CHIP_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_MODULE_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_ON_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, DCW_OFF_DRAWER_MEMORY), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_MODULE_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_ON_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_IV), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, IDCW_OFF_DRAWER_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_REQ_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_IV), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_CHIP_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_CHIP_DRAWER_HIT), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_MODULE), + CPUMF_EVENT_PTR(cf_z17, ICW_ON_DRAWER), + CPUMF_EVENT_PTR(cf_z17, ICW_OFF_DRAWER), + CPUMF_EVENT_PTR(cf_z17, CYCLES_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, CYCLES_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, INST_SAMETHRD), + CPUMF_EVENT_PTR(cf_z17, INST_DIFFTHRD), + CPUMF_EVENT_PTR(cf_z17, WRONG_BRANCH_PREDICTION), + CPUMF_EVENT_PTR(cf_z17, VX_BCD_EXECUTION_SLOTS), + CPUMF_EVENT_PTR(cf_z17, DECIMAL_INSTRUCTIONS), + CPUMF_EVENT_PTR(cf_z17, LAST_HOST_TRANSLATIONS), + CPUMF_EVENT_PTR(cf_z17, TX_NC_TABORT), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_NO_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, TX_C_TABORT_SPECIAL), + CPUMF_EVENT_PTR(cf_z17, DFLT_ACCESS), + CPUMF_EVENT_PTR(cf_z17, DFLT_CYCLES), + CPUMF_EVENT_PTR(cf_z17, SORTL), + CPUMF_EVENT_PTR(cf_z17, DFLT_CC), + CPUMF_EVENT_PTR(cf_z17, DFLT_CCFINISH), + CPUMF_EVENT_PTR(cf_z17, NNPA_INVOCATIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPLETIONS), + CPUMF_EVENT_PTR(cf_z17, NNPA_WAIT_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_HOLD_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_ONCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_OFFCHIP), + CPUMF_EVENT_PTR(cf_z17, NNPA_INST_DIFF), + CPUMF_EVENT_PTR(cf_z17, NNPA_4K_PREFETCH), + CPUMF_EVENT_PTR(cf_z17, NNPA_COMPL_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK), + CPUMF_EVENT_PTR(cf_z17, NNPA_RETRY_LOCK_WITH_PLO), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_ONE_THR_ACTIVE), + CPUMF_EVENT_PTR(cf_z17, MT_DIAG_CYCLES_TWO_THR_ACTIVE), + NULL, +}; + /* END: CPUM_CF COUNTER DEFINITIONS ===================================== */ static struct attribute_group cpumcf_pmu_events_group = { @@ -711,16 +1012,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) } /* Determine version specific crypto set */ - switch (ci.csvn) { - case 1 ... 5: + csvn = none; + if (ci.csvn >= 1 && ci.csvn <= 5) csvn = cpumcf_svn_12345_pmu_event_attr; - break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; - break; - default: - csvn = none; - } + else if (ci.csvn >= 6) + csvn = cpumcf_svn_678_pmu_event_attr; /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); @@ -749,6 +1045,14 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 0x8562: model = cpumcf_z15_pmu_event_attr; break; + case 0x3931: + case 0x3932: + model = cpumcf_z16_pmu_event_attr; + break; + case 0x9175: + case 0x9176: + model = cpumcf_z17_pmu_event_attr; + break; default: model = none; break; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index db62def4ef28..459af23a47a5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -5,8 +5,7 @@ * Copyright IBM Corp. 2013, 2018 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "cpum_sf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpum_sf: " fmt #include <linux/kernel.h> #include <linux/kernel_stat.h> @@ -14,7 +13,6 @@ #include <linux/percpu.h> #include <linux/pid.h> #include <linux/notifier.h> -#include <linux/export.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/moduleparam.h> @@ -22,6 +20,23 @@ #include <asm/irq.h> #include <asm/debug.h> #include <asm/timex.h> +#include <linux/io.h> + +/* Perf PMU definitions for the sampling facility */ +#define PERF_CPUM_SF_MAX_CTR 2 +#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ +#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ +#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ + +#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) +#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) +#define TEAR_REG(hwc) ((hwc)->last_tag) +#define SAMPL_RATE(hwc) ((hwc)->event_base) +#define SAMPL_FLAGS(hwc) ((hwc)->config_base) +#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) +#define SAMPL_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. @@ -42,7 +57,7 @@ #define CPUM_SF_SDBT_TL_OFFSET (CPUM_SF_SDB_PER_TABLE * 8) static inline int require_table_link(const void *sdbt) { - return ((unsigned long) sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; + return ((unsigned long)sdbt & ~PAGE_MASK) == CPUM_SF_SDBT_TL_OFFSET; } /* Minimum and maximum sampling buffer sizes: @@ -99,15 +114,55 @@ static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); /* Debug feature */ static debug_info_t *sfdbg; +/* Sampling control helper functions */ +static inline unsigned long freq_to_sample_rate(struct hws_qsi_info_block *qsi, + unsigned long freq) +{ + return (USEC_PER_SEC / freq) * qsi->cpu_speed; +} + +static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, + unsigned long rate) +{ + return USEC_PER_SEC * qsi->cpu_speed / rate; +} + +/* Return pointer to trailer entry of an sample data block */ +static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) +{ + void *ret; + + ret = (void *)v; + ret += PAGE_SIZE; + ret -= sizeof(struct hws_trailer_entry); + + return ret; +} + +/* + * Return true if the entry in the sample data block table (sdbt) + * is a link to the next sdbt + */ +static inline int is_link_entry(unsigned long *s) +{ + return *s & 0x1UL ? 1 : 0; +} + +/* Return pointer to the linked sdbt */ +static inline unsigned long *get_next_sdbt(unsigned long *s) +{ + return phys_to_virt(*s & ~0x1UL); +} + /* * sf_disable() - Switch off sampling facility */ -static int sf_disable(void) +static void sf_disable(void) { struct hws_lsctl_request_block sreq; memset(&sreq, 0, sizeof(sreq)); - return lsctl(&sreq); + lsctl(&sreq); } /* @@ -123,57 +178,44 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw) */ static void free_sampling_buffer(struct sf_buffer *sfb) { - unsigned long *sdbt, *curr; - - if (!sfb->sdbt) - return; + unsigned long *sdbt, *curr, *head; sdbt = sfb->sdbt; - curr = sdbt; - + if (!sdbt) + return; + sfb->sdbt = NULL; /* Free the SDBT after all SDBs are processed... */ - while (1) { - if (!*curr || !sdbt) - break; - - /* Process table-link entries */ + head = sdbt; + curr = sdbt; + do { if (is_link_entry(curr)) { + /* Process table-link entries */ curr = get_next_sdbt(curr); - if (sdbt) - free_page((unsigned long) sdbt); - - /* If the origin is reached, sampling buffer is freed */ - if (curr == sfb->sdbt) - break; - else - sdbt = curr; + free_page((unsigned long)sdbt); + sdbt = curr; } else { /* Process SDB pointer */ - if (*curr) { - free_page(*curr); - curr++; - } + free_page((unsigned long)phys_to_virt(*curr)); + curr++; } - } - - debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__, - (unsigned long)sfb->sdbt); + } while (curr != head); memset(sfb, 0, sizeof(*sfb)); } static int alloc_sample_data_block(unsigned long *sdbt, gfp_t gfp_flags) { - unsigned long sdb, *trailer; + struct hws_trailer_entry *te; + unsigned long sdb; /* Allocate and initialize sample-data-block */ sdb = get_zeroed_page(gfp_flags); if (!sdb) return -ENOMEM; - trailer = trailer_entry_ptr(sdb); - *trailer = SDB_TE_ALERT_REQ_MASK; + te = trailer_entry_ptr(sdb); + te->header.a = 1; /* Link SDB into the sample-data-block-table */ - *sdbt = sdb; + *sdbt = virt_to_phys((void *)sdb); return 0; } @@ -212,10 +254,8 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, * the sampling buffer origin. */ if (sfb->sdbt != get_next_sdbt(tail)) { - debug_sprintf_event(sfdbg, 3, "%s: " - "sampling buffer is not linked: origin %#lx" - " tail %#lx\n", __func__, - (unsigned long)sfb->sdbt, + debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n", + __func__, (unsigned long)sfb->sdbt, (unsigned long)tail); return -EINVAL; } @@ -225,14 +265,14 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, for (i = 0; i < num_sdb; i++) { /* Allocate a new SDB-table if it is full. */ if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(gfp_flags); + new = (unsigned long *)get_zeroed_page(gfp_flags); if (!new) { rc = -ENOMEM; break; } sfb->num_sdbt++; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys((void *)new) + 1; tail_prev = tail; tail = new; } @@ -251,7 +291,7 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, */ if (tail_prev) { sfb->num_sdbt--; - free_page((unsigned long) new); + free_page((unsigned long)new); tail = tail_prev; } break; @@ -262,12 +302,9 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, } /* Link sampling buffer to its origin */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; - debug_sprintf_event(sfdbg, 4, "%s: new buffer" - " settings: sdbt %lu sdb %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); return rc; } @@ -290,7 +327,7 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) return -EINVAL; /* Allocate the sample-data-block-table origin */ - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) return -ENOMEM; sfb->num_sdb = 0; @@ -300,19 +337,12 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) * realloc_sampling_buffer() invocation. */ sfb->tail = sfb->sdbt; - *sfb->tail = (unsigned long)(void *) sfb->sdbt + 1; + *sfb->tail = virt_to_phys((void *)sfb->sdbt) + 1; /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); - if (rc) { + if (rc) free_sampling_buffer(sfb); - debug_sprintf_event(sfdbg, 4, "%s: " - "realloc_sampling_buffer failed with rc %i\n", - __func__, rc); - } else - debug_sprintf_event(sfdbg, 4, - "%s: tear %#lx dear %#lx\n", __func__, - (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt); return rc; } @@ -324,8 +354,8 @@ static void sfb_set_limits(unsigned long min, unsigned long max) CPUM_SF_MAX_SDB = max; memset(&si, 0, sizeof(si)); - if (!qsi(&si)) - CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); + qsi(&si); + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); } static unsigned long sfb_max_limit(struct hw_perf_event *hwc) @@ -344,12 +374,6 @@ static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, return 0; } -static int sfb_has_pending_allocs(struct sf_buffer *sfb, - struct hw_perf_event *hwc) -{ - return sfb_pending_allocs(sfb, hwc) > 0; -} - static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) { /* Limit the number of SDBs to not exceed the maximum */ @@ -366,14 +390,13 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) static void deallocate_buffers(struct cpu_hw_sf *cpuhw) { - if (cpuhw->sfb.sdbt) + if (sf_buffer_available(cpuhw)) free_sampling_buffer(&cpuhw->sfb); } static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq; - size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -404,7 +427,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). */ - sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); @@ -420,12 +442,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) if (sf_buffer_available(cpuhw)) return 0; - debug_sprintf_event(sfdbg, 3, - "%s: rate %lu f %lu sdb %lu/%lu" - " sample_size %lu cpuhw %p\n", __func__, - SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), - sample_size, cpuhw); - return alloc_sampling_buffer(&cpuhw->sfb, sfb_pending_allocs(&cpuhw->sfb, hwc)); } @@ -482,8 +498,6 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, if (num) sfb_account_allocs(num, hwc); - debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n", - __func__, OVERFLOW_REG(hwc), ratio, num); OVERFLOW_REG(hwc) = 0; } @@ -501,13 +515,11 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, static void extend_sampling_buffer(struct sf_buffer *sfb, struct hw_perf_event *hwc) { - unsigned long num, num_old; - int rc; + unsigned long num; num = sfb_pending_allocs(sfb, hwc); if (!num) return; - num_old = sfb->num_sdb; /* Disable the sampling facility to reset any states and also * clear pending measurement alerts. @@ -519,62 +531,32 @@ static void extend_sampling_buffer(struct sf_buffer *sfb, * called by perf. Because this is a reallocation, it is fine if the * new SDB-request cannot be satisfied immediately. */ - rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); - if (rc) - debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n", - __func__, rc); - - if (sfb_has_pending_allocs(sfb, hwc)) - debug_sprintf_event(sfdbg, 5, "%s: " - "req %lu alloc %lu remaining %lu\n", - __func__, num, sfb->num_sdb - num_old, - sfb_pending_allocs(sfb, hwc)); + realloc_sampling_buffer(sfb, num, GFP_ATOMIC); } /* Number of perf events counting hardware events */ -static atomic_t num_events; +static refcount_t num_events; /* Used to avoid races in calling reserve/release_cpumf_hardware */ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_INIT 0 #define PMC_RELEASE 1 -#define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - int err; - struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - err = 0; - switch (*((int *) flags)) { + sf_disable(); + switch (*((int *)flags)) { case PMC_INIT: - memset(cpusf, 0, sizeof(*cpusf)); - err = qsi(&cpusf->qsi); - if (err) - break; - cpusf->flags |= PMU_F_RESERVED; - err = sf_disable(); - if (err) - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - debug_sprintf_event(sfdbg, 5, - "%s: initialized: cpuhw %p\n", __func__, - cpusf); + memset(cpuhw, 0, sizeof(*cpuhw)); + qsi(&cpuhw->qsi); + cpuhw->flags |= PMU_F_RESERVED; break; case PMC_RELEASE: - cpusf->flags &= ~PMU_F_RESERVED; - err = sf_disable(); - if (err) { - pr_err("Switching off the sampling facility failed " - "with rc %i\n", err); - } else - deallocate_buffers(cpusf); - debug_sprintf_event(sfdbg, 5, - "%s: released: cpuhw %p\n", __func__, - cpusf); + cpuhw->flags &= ~PMU_F_RESERVED; + deallocate_buffers(cpuhw); break; } - if (err) - *((int *) flags) |= PMC_FAILURE; } static void release_pmc_hardware(void) @@ -585,27 +567,19 @@ static void release_pmc_hardware(void) on_each_cpu(setup_pmc_cpu, &flags, 1); } -static int reserve_pmc_hardware(void) +static void reserve_pmc_hardware(void) { int flags = PMC_INIT; on_each_cpu(setup_pmc_cpu, &flags, 1); - if (flags & PMC_FAILURE) { - release_pmc_hardware(); - return -ENODEV; - } irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; } static void hw_perf_event_destroy(struct perf_event *event) { /* Release PMC if this is the last perf event */ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); + if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -671,7 +645,8 @@ static void cpumsf_output_event_pid(struct perf_event *event, /* Protect callchain buffers, tasks */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); if (perf_output_begin(&handle, data, event, header.size)) goto out; @@ -708,9 +683,6 @@ static unsigned long getrate(bool freq, unsigned long sample, */ if (sample_rate_to_freq(si, rate) > sysctl_perf_event_sample_rate) { - debug_sprintf_event(sfdbg, 1, "%s: " - "Sampling rate exceeds maximum " - "perf sample rate\n", __func__); rate = 0; } } @@ -755,9 +727,6 @@ static int __hw_perf_event_init_rate(struct perf_event *event, attr->sample_period = rate; SAMPL_RATE(hwc) = rate; hw_init_period(hwc, SAMPL_RATE(hwc)); - debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n", - __func__, event->cpu, event->attr.sample_period, - event->attr.freq, SAMPLE_FREQ_MODE(hwc)); return 0; } @@ -767,23 +736,16 @@ static int __hw_perf_event_init(struct perf_event *event) struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - int cpu, err; + int cpu, err = 0; /* Reserve CPU-measurement sampling facility */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); + mutex_lock(&pmc_reserve_mutex); + if (!refcount_inc_not_zero(&num_events)) { + reserve_pmc_hardware(); + refcount_set(&num_events, 1); } event->destroy = hw_perf_event_destroy; - if (err) - goto out; - /* Access per-CPU sampling information (query sampling info) */ /* * The event->cpu value can be -1 to count on every CPU, for example, @@ -795,9 +757,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ memset(&si, 0, sizeof(si)); cpuhw = NULL; - if (event->cpu == -1) + if (event->cpu == -1) { qsi(&si); - else { + } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. */ @@ -834,21 +796,13 @@ static int __hw_perf_event_init(struct perf_event *event) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_DIAG_MODE; } - /* Check and set other sampling flags */ - if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) - SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - err = __hw_perf_event_init_rate(event, &si); if (err) goto out; - /* Initialize sample data overflow accounting */ - hwc->extra_reg.reg = REG_OVERFLOW; - OVERFLOW_REG(hwc) = 0; - /* Use AUX buffer. No need to allocate it by ourself */ if (attr->config == PERF_EVENT_CPUM_SF_DIAG) - return 0; + goto out; /* Allocate the per-CPU sampling buffer using the CPU information * from the event. If the event is not pinned to a particular @@ -878,6 +832,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (is_default_overflow_handler(event)) event->overflow_handler = cpumsf_output_event_pid; out: + mutex_unlock(&pmc_reserve_mutex); return err; } @@ -919,10 +874,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) return -ENOENT; } - /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= 0 && !cpu_online(event->cpu)) - return -ENODEV; - /* Force reset of idle/hv excludes regardless of what the * user requested. */ @@ -932,9 +883,6 @@ static int cpumsf_pmu_event_init(struct perf_event *event) event->attr.exclude_idle = 0; err = __hw_perf_event_init(event); - if (unlikely(err)) - if (event->destroy) - event->destroy(event); return err; } @@ -944,10 +892,14 @@ static void cpumsf_pmu_enable(struct pmu *pmu) struct hw_perf_event *hwc; int err; - if (cpuhw->flags & PMU_F_ENABLED) - return; - - if (cpuhw->flags & PMU_F_ERR_MASK) + /* + * Event must be + * - added/started on this CPU (PMU_F_IN_USE set) + * - and CPU must be available (PMU_F_RESERVED set) + * - and not already enabled (PMU_F_ENABLED not set) + * - and not in error condition (PMU_F_ERR_MASK not set) + */ + if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED)) return; /* Check whether to extent the sampling buffer. @@ -961,40 +913,27 @@ static void cpumsf_pmu_enable(struct pmu *pmu) * facility, but it can be fully re-enabled using sampling controls that * have been saved in cpumsf_pmu_disable(). */ - if (cpuhw->event) { - hwc = &cpuhw->event->hw; - if (!(SAMPL_DIAG_MODE(hwc))) { - /* - * Account number of overflow-designated - * buffer extents - */ - sfb_account_overflows(cpuhw, hwc); - extend_sampling_buffer(&cpuhw->sfb, hwc); - } - /* Rate may be adjusted with ioctl() */ - cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); + hwc = &cpuhw->event->hw; + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(hwc); /* (Re)enable the PMU and sampling facility */ - cpuhw->flags |= PMU_F_ENABLED; - barrier(); - err = lsctl(&cpuhw->lsctl); if (err) { - cpuhw->flags &= ~PMU_F_ENABLED; - pr_err("Loading sampling controls failed: op %i err %i\n", - 1, err); + pr_err("Loading sampling controls failed: op 1 err %i\n", err); return; } /* Load current program parameter */ - lpp(&S390_lowcore.lpp); - - debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " - "interval %#lx tear %#lx dear %#lx\n", __func__, - cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, - cpuhw->lsctl.cd, cpuhw->lsctl.interval, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); + lpp(&get_lowcore()->lpp); + cpuhw->flags |= PMU_F_ENABLED; } static void cpumsf_pmu_disable(struct pmu *pmu) @@ -1017,31 +956,27 @@ static void cpumsf_pmu_disable(struct pmu *pmu) err = lsctl(&inactive); if (err) { - pr_err("Loading sampling controls failed: op %i err %i\n", - 2, err); + pr_err("Loading sampling controls failed: op 2 err %i\n", err); return; } - /* Save state of TEAR and DEAR register contents */ - err = qsi(&si); - if (!err) { - /* TEAR/DEAR values are valid only if the sampling facility is - * enabled. Note that cpumsf_pmu_disable() might be called even - * for a disabled sampling facility because cpumsf_pmu_enable() - * controls the enable/disable state. - */ - if (si.es) { - cpuhw->lsctl.tear = si.tear; - cpuhw->lsctl.dear = si.dear; - } - } else - debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n", - __func__, err); + /* + * Save state of TEAR and DEAR register contents. + * TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + qsi(&si); + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -1050,7 +985,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1133,12 +1068,9 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; - if (perf_event_overflow(event, &data, ®s)) { - overflow = 1; - event->pmu->stop(event, 0); - } + overflow = perf_event_overflow(event, &data, ®s); perf_event_update_userpage(event); out: return overflow; @@ -1160,7 +1092,7 @@ static void perf_event_count_update(struct perf_event *event, u64 count) * combined-sampling data entry consists of a basic- and a diagnostic-sampling * data entry. The sampling function is determined by the flags in the perf * event hardware structure. The function always works with a combined-sampling - * data entry but ignores the the diagnostic portion if it is not available. + * data entry but ignores the diagnostic portion if it is not available. * * Note that the implementation focuses on basic-sampling data entries and, if * such an entry is not valid, the entire combined-sampling data entry is @@ -1175,11 +1107,11 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, struct hws_trailer_entry *te; struct hws_basic_entry *sample; - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_basic_entry *) *sdbt; - while ((unsigned long *) sample < (unsigned long *) te) { + te = trailer_entry_ptr((unsigned long)sdbt); + sample = (struct hws_basic_entry *)sdbt; + while ((unsigned long *)sample < (unsigned long *)te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ @@ -1202,11 +1134,6 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, /* Count discarded samples */ *overflow += 1; } else { - debug_sprintf_event(sfdbg, 4, - "%s: Found unknown" - " sampling data entry: te->f %i" - " basic.def %#4x (%p)\n", __func__, - te->f, sample->def, sample); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1217,7 +1144,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * that are not full. Stop processing if the first * invalid format was detected. */ - if (!te->f) + if (!te->header.f) break; } @@ -1235,71 +1162,62 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, * The sampling buffer position are retrieved and saved in the TEAR_REG * register of the specified perf event. * - * Only full sample-data-blocks are processed. Specify the flash_all flag - * to also walk through partially filled sample-data-blocks. It is ignored - * if PERF_CPUM_SF_FULL_BLOCKS is set. The PERF_CPUM_SF_FULL_BLOCKS flag - * enforces the processing of full sample-data-blocks only (trailer entries - * with the block-full-indicator bit set). + * Only full sample-data-blocks are processed. Specify the flush_all flag + * to also walk through partially filled sample-data-blocks. */ static void hw_perf_event_update(struct perf_event *event, int flush_all) { + unsigned long long event_overflow, sampl_overflow, num_sdb; struct hw_perf_event *hwc = &event->hw; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; - unsigned long *sdbt; - unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; + unsigned long *sdbt, sdb; int done; /* * AUX buffer is used when in diagnostic sampling mode. * No perf events/samples are created. */ - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(hwc)) return; - if (flush_all && SDB_FULL_BLOCKS(hwc)) - flush_all = 0; - - sdbt = (unsigned long *) TEAR_REG(hwc); + sdbt = (unsigned long *)TEAR_REG(hwc); done = event_overflow = sampl_overflow = num_sdb = 0; while (!done) { /* Get the trailer entry of the sample-data-block */ - te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); + sdb = (unsigned long)phys_to_virt(*sdbt); + te = trailer_entry_ptr(sdb); /* Leave loop if no more work to do (block full indicator) */ - if (!te->f) { + if (!te->header.f) { done = 1; if (!flush_all) break; } /* Check the sample overflow count */ - if (te->overflow) + if (te->header.overflow) /* Account sample overflows and, if a particular limit * is reached, extend the sampling buffer. * For details, see sfb_account_overflows(). */ - sampl_overflow += te->overflow; - - /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx " - "overflow %llu timestamp %#llx\n", - __func__, (unsigned long)sdbt, te->overflow, - (te->f) ? trailer_timestamp(te) : 0ULL); + sampl_overflow += te->header.overflow; /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. */ - hw_collect_samples(event, sdbt, &event_overflow); + hw_collect_samples(event, (unsigned long *)sdb, &event_overflow); num_sdb++; /* Reset trailer (using compare-double-and-swap) */ + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - te_flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; - te_flags |= SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - te->flags, te->overflow, - te_flags, 0ULL)); + new.val = prev.val; + new.f = 0; + new.a = 1; + new.overflow = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); /* Advance to next sample-data-block */ sdbt++; @@ -1307,7 +1225,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sdbt = get_next_sdbt(sdbt); /* Update event hardware registers */ - TEAR_REG(hwc) = (unsigned long) sdbt; + TEAR_REG(hwc) = (unsigned long)sdbt; /* Stop processing sample-data if all samples of the current * sample-data-block were flushed even if it was not full. @@ -1329,25 +1247,30 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * are dropped. * Slightly increase the interval to avoid hitting this limit. */ - if (event_overflow) { + if (event_overflow) SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); - debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", - __func__, - DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); - } +} + +static inline unsigned long aux_sdb_index(struct aux_buffer *aux, + unsigned long i) +{ + return i % aux->sfb.num_sdb; +} + +static inline unsigned long aux_sdb_num(unsigned long start, unsigned long end) +{ + return end >= start ? end - start + 1 : 0; +} - if (sampl_overflow || event_overflow) - debug_sprintf_event(sfdbg, 4, "%s: " - "overflows: sample %llu event %llu" - " total %llu num_sdb %llu\n", - __func__, sampl_overflow, event_overflow, - OVERFLOW_REG(hwc), num_sdb); +static inline unsigned long aux_sdb_num_alert(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->alert_mark); } -#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) -#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) -#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) -#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) +static inline unsigned long aux_sdb_num_empty(struct aux_buffer *aux) +{ + return aux_sdb_num(aux->head, aux->empty_mark); +} /* * Get trailer entry by index of SDB. @@ -1357,9 +1280,9 @@ static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, { unsigned long sdb; - index = AUX_SDB_INDEX(aux, index); + index = aux_sdb_index(aux, index); sdb = aux->sdb_index[index]; - return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + return trailer_entry_ptr(sdb); } /* @@ -1381,10 +1304,10 @@ static void aux_output_end(struct perf_output_handle *handle) if (!aux) return; - range_scan = AUX_SDB_NUM_ALERT(aux); + range_scan = aux_sdb_num_alert(aux); for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + if (!te->header.f) break; } /* i is num of SDBs which are full */ @@ -1392,10 +1315,7 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags &= ~SDB_TE_ALERT_REQ_MASK; - - debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", - __func__, i, range_scan, aux->head); + te->header.a = 0; } /* @@ -1411,12 +1331,10 @@ static int aux_output_begin(struct perf_output_handle *handle, struct aux_buffer *aux, struct cpu_hw_sf *cpuhw) { - unsigned long range; - unsigned long i, range_scan, idx; - unsigned long head, base, offset; + unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; - if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + if (handle->head & ~PAGE_MASK) return -EINVAL; aux->head = handle->head >> PAGE_SHIFT; @@ -1428,18 +1346,14 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. */ - debug_sprintf_event(sfdbg, 6, - "%s: range %ld head %ld alert %ld empty %ld\n", - __func__, range, aux->head, aux->alert_mark, - aux->empty_mark); - if (range > AUX_SDB_NUM_EMPTY(aux)) { - range_scan = range - AUX_SDB_NUM_EMPTY(aux); + if (range > aux_sdb_num_empty(aux)) { + range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); - te->flags &= ~(SDB_TE_BUFFER_FULL_MASK | - SDB_TE_ALERT_REQ_MASK); - te->overflow = 0; + te->header.f = 0; + te->header.a = 0; + te->header.overflow = 0; } /* Save the position of empty SDBs */ aux->empty_mark = aux->head + range - 1; @@ -1448,20 +1362,14 @@ static int aux_output_begin(struct perf_output_handle *handle, /* Set alert indicator */ aux->alert_mark = aux->head + range/2 - 1; te = aux_sdb_trailer(aux, aux->alert_mark); - te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + te->header.a = 1; /* Reset hardware buffer head */ - head = AUX_SDB_INDEX(aux, aux->head); + head = aux_sdb_index(aux, aux->head); base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; offset = head % CPUM_SF_SDB_PER_TABLE; - cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); - cpuhw->lsctl.dear = aux->sdb_index[head]; - - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " - "index %ld tear %#lx dear %#lx\n", __func__, - aux->head, aux->alert_mark, aux->empty_mark, - head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); + cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); return 0; } @@ -1475,14 +1383,15 @@ static int aux_output_begin(struct perf_output_handle *handle, static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; te = aux_sdb_trailer(aux, alert_index); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - *overflow = orig_overflow = te->overflow; - if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + new.val = prev.val; + *overflow = prev.overflow; + if (prev.f) { /* * SDB is already set by hardware. * Abort and try to set somewhere @@ -1490,10 +1399,9 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, */ return false; } - new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 1; + new.overflow = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); return true; } @@ -1522,14 +1430,12 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long long orig_overflow, orig_flags, new_flags; - unsigned long i, range_scan, idx, idx_old; + union hws_trailer_header prev, new; + unsigned long i, range_scan, idx; + unsigned long long orig_overflow; struct hws_trailer_entry *te; - debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " - "empty %ld\n", __func__, range, aux->head, - aux->alert_mark, aux->empty_mark); - if (range <= AUX_SDB_NUM_EMPTY(aux)) + if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. * Just set alert indicator. Should check race with hardware @@ -1550,30 +1456,27 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * Start scanning from one SDB behind empty_mark. If the new alert * indicator fall into this range, set it. */ - range_scan = range - AUX_SDB_NUM_EMPTY(aux); - idx_old = idx = aux->empty_mark + 1; + range_scan = range - aux_sdb_num_empty(aux); + idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); + prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - orig_flags = te->flags; - orig_overflow = te->overflow; - new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + new.val = prev.val; + orig_overflow = prev.overflow; + new.f = 0; + new.overflow = 0; if (idx == aux->alert_mark) - new_flags |= SDB_TE_ALERT_REQ_MASK; + new.a = 1; else - new_flags &= ~SDB_TE_ALERT_REQ_MASK; - } while (!cmpxchg_double(&te->flags, &te->overflow, - orig_flags, orig_overflow, - new_flags, 0ULL)); + new.a = 0; + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); *overflow += orig_overflow; } /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; - debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " - "empty %ld\n", __func__, range_scan, idx_old, - idx - 1, aux->empty_mark); return true; } @@ -1590,12 +1493,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long num_sdb; aux = perf_get_aux(handle); - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Inform user space new data arrived */ - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; - debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; + debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__, size >> PAGE_SHIFT); perf_aux_output_end(handle, size); @@ -1607,12 +1510,9 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) pr_err("The AUX buffer with %lu pages for the " "diagnostic-sampling mode is full\n", num_sdb); - debug_sprintf_event(sfdbg, 1, - "%s: AUX buffer used up\n", - __func__); break; } - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Update head and alert_mark to new position */ @@ -1632,23 +1532,11 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " "pages to overflow\n", aux->sfb.num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " - "overflow %lld\n", __func__, - aux->head, range, overflow); } else { - size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "already full, try another\n", - __func__, - aux->head, aux->alert_mark); } } - - if (done) - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "empty %ld\n", __func__, aux->head, - aux->alert_mark, aux->empty_mark); } /* @@ -1670,15 +1558,13 @@ static void aux_buffer_free(void *data) kfree(aux->sdbt_index); kfree(aux->sdb_index); kfree(aux); - - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); } static void aux_sdb_init(unsigned long sdb) { struct hws_trailer_entry *te; - te = (struct hws_trailer_entry *)trailer_entry_ptr(sdb); + te = trailer_entry_ptr(sdb); /* Save clock base */ te->clock_base = 1; @@ -1741,7 +1627,7 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, /* Allocate the first SDBT */ sfb->num_sdbt = 0; - sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + sfb->sdbt = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!sfb->sdbt) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; @@ -1753,23 +1639,23 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ for (i = 0; i < nr_pages; i++, tail++) { if (require_table_link(tail)) { - new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + new = (unsigned long *)get_zeroed_page(GFP_KERNEL); if (!new) goto no_sdbt; aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; /* Link current page to tail of chain */ - *tail = (unsigned long)(void *) new + 1; + *tail = virt_to_phys(new) + 1; tail = new; } /* Tail is the entry in a SDBT */ - *tail = (unsigned long)pages[i]; + *tail = virt_to_phys(pages[i]); aux->sdb_index[i] = (unsigned long)pages[i]; aux_sdb_init((unsigned long)pages[i]); } sfb->num_sdb = nr_pages; /* Link the last entry in the SDBT to the first SDBT */ - *tail = (unsigned long) sfb->sdbt + 1; + *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; /* @@ -1779,9 +1665,6 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); - return aux; no_sdbt: @@ -1802,7 +1685,7 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } -/* Check if the new sampling period/freqeuncy is appropriate. +/* Check if the new sampling period/frequency is appropriate. * * Return non-zero on error and zero on passed checks. */ @@ -1814,8 +1697,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) memset(&si, 0, sizeof(si)); if (event->cpu == -1) { - if (qsi(&si)) - return -ENODEV; + qsi(&si); } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. @@ -1825,7 +1707,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) si = cpuhw->qsi; } - do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + do_freq = !!SAMPL_FREQ_MODE(&event->hw); rate = getrate(do_freq, value, &si); if (!rate) return -EINVAL; @@ -1833,10 +1715,6 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) event->attr.sample_period = rate; SAMPL_RATE(&event->hw) = rate; hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); - debug_sprintf_event(sfdbg, 4, "%s:" - " cpu %d value %#llx period %#llx freq %d\n", - __func__, event->cpu, value, - event->attr.sample_period, do_freq); return 0; } @@ -1847,12 +1725,8 @@ static void cpumsf_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + if (!(event->hw.state & PERF_HES_STOPPED)) return; - - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - perf_pmu_disable(event->pmu); event->hw.state = 0; cpuhw->lsctl.cs = 1; @@ -1877,7 +1751,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) event->hw.state |= PERF_HES_STOPPED; if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event, 1); + /* CPU hotplug off removes SDBs. No samples to extract. */ + if (cpuhw->flags & PMU_F_RESERVED) + hw_perf_event_update(event, 1); event->hw.state |= PERF_HES_UPTODATE; } perf_pmu_enable(event->pmu); @@ -1887,15 +1763,14 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); struct aux_buffer *aux; - int err; + int err = 0; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; - if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) + if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw)) return -EINVAL; - err = 0; perf_pmu_disable(event->pmu); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -1909,9 +1784,9 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) cpuhw->lsctl.h = 1; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); if (!SAMPL_DIAG_MODE(&event->hw)) { - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; - TEAR_REG(&event->hw) = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.tear = virt_to_phys(cpuhw->sfb.sdbt); + cpuhw->lsctl.dear = *(unsigned long *)cpuhw->sfb.sdbt; + TEAR_REG(&event->hw) = (unsigned long)cpuhw->sfb.sdbt; } /* Ensure sampling functions are in the disabled state. If disabled, @@ -2055,18 +1930,17 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Program alert request */ if (alert & CPU_MF_INT_SF_PRA) { - if (cpuhw->flags & PMU_F_IN_USE) + if (cpuhw->flags & PMU_F_IN_USE) { if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) hw_collect_aux(cpuhw); else hw_perf_event_update(cpuhw->event, 0); - else - WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); + } } /* Report measurement alerts only for non-PRA codes */ if (alert != CPU_MF_INT_SF_PRA) - debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__, alert); /* Sampling authorization change request */ @@ -2082,7 +1956,7 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Invalid sampling buffer entry */ if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) { - pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n", + pr_err("A sampling buffer entry is incorrect (alert=%#x)\n", alert); cpuhw->flags |= PMU_F_ERR_IBE; sf_disable(); @@ -2094,7 +1968,7 @@ static int cpusf_pmu_setup(unsigned int cpu, int flags) /* Ignore the notification if no events are scheduled on the PMU. * This might be racy... */ - if (!atomic_read(&num_events)) + if (!refcount_read(&num_events)) return 0; local_irq_disable(); @@ -2156,10 +2030,12 @@ static const struct kernel_param_ops param_ops_sfb_size = { .get = param_get_sfb_size, }; -#define RS_INIT_FAILURE_QSI 0x0001 -#define RS_INIT_FAILURE_BSDES 0x0002 -#define RS_INIT_FAILURE_ALRT 0x0003 -#define RS_INIT_FAILURE_PERF 0x0004 +enum { + RS_INIT_FAILURE_BSDES = 2, /* Bad basic sampling size */ + RS_INIT_FAILURE_ALRT = 3, /* IRQ registration failure */ + RS_INIT_FAILURE_PERF = 4 /* PMU registration failure */ +}; + static void __init pr_cpumsf_err(unsigned int reason) { pr_err("Sampling facility support for perf is not available: " @@ -2175,11 +2051,7 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; memset(&si, 0, sizeof(si)); - if (qsi(&si)) { - pr_cpumsf_err(RS_INIT_FAILURE_QSI); - return -ENODEV; - } - + qsi(&si); if (!si.as && !si.ad) return -ENODEV; @@ -2197,7 +2069,7 @@ static int __init init_cpum_sampling_pmu(void) CPUMF_EVENT_PTR(SF, SF_CYCLES_BASIC_DIAG); } - sfdbg = debug_register(KMSG_COMPONENT, 2, 1, 80); + sfdbg = debug_register("cpum_sf", 2, 1, 80); if (!sfdbg) { pr_err("Registering for s390dbf failed\n"); return -ENOMEM; diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index ea7729bebaa0..606750bae508 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -5,17 +5,17 @@ * Copyright IBM Corp. 2012, 2013 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> */ -#define KMSG_COMPONENT "perf" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "perf: " fmt #include <linux/kernel.h> #include <linux/perf_event.h> #include <linux/kvm_host.h> #include <linux/percpu.h> -#include <linux/export.h> #include <linux/seq_file.h> #include <linux/spinlock.h> +#include <linux/uaccess.h> #include <linux/sysfs.h> +#include <asm/stacktrace.h> #include <asm/irq.h> #include <asm/cpu_mf.h> #include <asm/lowcore.h> @@ -30,7 +30,7 @@ static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) if (!stack) return NULL; - return (struct kvm_s390_sie_block *) stack->empty1[0]; + return (struct kvm_s390_sie_block *)stack->sie_control_block; } static bool is_in_guest(struct pt_regs *regs) @@ -54,7 +54,7 @@ static unsigned long instruction_pointer_guest(struct pt_regs *regs) return sie_block(regs)->gpsw.addr; } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return is_in_guest(regs) ? instruction_pointer_guest(regs) : instruction_pointer(regs); @@ -81,7 +81,7 @@ static unsigned long perf_misc_flags_sf(struct pt_regs *regs) return flags; } -unsigned long perf_misc_flags(struct pt_regs *regs) +unsigned long perf_arch_misc_flags(struct pt_regs *regs) { /* Check if the cpum_sf PMU has created the pt_regs structure. * In this case, perf misc flags can be easily extracted. Otherwise, @@ -212,6 +212,12 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + arch_stack_walk_user_common(NULL, NULL, entry, regs, true); +} + /* Perf definitions for PMU event attributes in sysfs */ ssize_t cpumf_events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) @@ -219,5 +225,5 @@ ssize_t cpumf_events_sysfs_show(struct device *dev, struct perf_pmu_events_attr *pmu_attr; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%04llx\n", pmu_attr->id); + return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id); } diff --git a/arch/s390/kernel/perf_pai.c b/arch/s390/kernel/perf_pai.c new file mode 100644 index 000000000000..810f5b6c5e01 --- /dev/null +++ b/arch/s390/kernel/perf_pai.c @@ -0,0 +1,1230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Performance event support - Processor Activity Instrumentation Facility + * + * Copyright IBM Corp. 2026 + * Author(s): Thomas Richter <tmricht@linux.ibm.com> + */ +#define pr_fmt(fmt) "pai: " fmt + +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/perf_event.h> +#include <asm/ctlreg.h> +#include <asm/pai.h> +#include <asm/debug.h> + +static debug_info_t *paidbg; + +DEFINE_STATIC_KEY_FALSE(pai_key); + +enum { + PAI_PMU_CRYPTO, /* Index of PMU pai_crypto */ + PAI_PMU_EXT, /* Index of PMU pai_ext */ + PAI_PMU_MAX /* # of PAI PMUs */ +}; + +enum { + PAIE1_CB_SZ = 0x200, /* Size of PAIE1 control block */ + PAIE1_CTRBLOCK_SZ = 0x400 /* Size of PAIE1 counter blocks */ +}; + +struct pai_userdata { + u16 num; + u64 value; +} __packed; + +/* Create the PAI extension 1 control block area. + * The PAI extension control block 1 is pointed to by lowcore + * address 0x1508 for each CPU. This control block is 512 bytes in size + * and requires a 512 byte boundary alignment. + */ +struct paiext_cb { /* PAI extension 1 control block */ + u64 header; /* Not used */ + u64 reserved1; + u64 acc; /* Addr to analytics counter control block */ + u8 reserved2[PAIE1_CTRBLOCK_SZ - 3 * sizeof(u64)]; +} __packed; + +struct pai_map { + unsigned long *area; /* Area for CPU to store counters */ + struct pai_userdata *save; /* Page to store no-zero counters */ + unsigned int active_events; /* # of PAI crypto users */ + refcount_t refcnt; /* Reference count mapped buffers */ + struct perf_event *event; /* Perf event for sampling */ + struct list_head syswide_list; /* List system-wide sampling events */ + struct paiext_cb *paiext_cb; /* PAI extension control block area */ + bool fullpage; /* True: counter area is a full page */ +}; + +struct pai_mapptr { + struct pai_map *mapptr; +}; + +static struct pai_root { /* Anchor to per CPU data */ + refcount_t refcnt; /* Overall active events */ + struct pai_mapptr __percpu *mapptr; +} pai_root[PAI_PMU_MAX]; + +/* This table defines the different parameters of the PAI PMUs. During + * initialization the machine dependent values are extracted and saved. + * However most of the values are static and do not change. + * There is one table entry per PAI PMU. + */ +struct pai_pmu { /* Define PAI PMU characteristics */ + const char *pmuname; /* Name of PMU */ + const int facility_nr; /* Facility number to check for support */ + unsigned int num_avail; /* # Counters defined by hardware */ + unsigned int num_named; /* # Counters known by name */ + unsigned long base; /* Counter set base number */ + unsigned long kernel_offset; /* Offset to kernel part in counter page */ + unsigned long area_size; /* Size of counter area */ + const char * const *names; /* List of counter names */ + struct pmu *pmu; /* Ptr to supporting PMU */ + int (*init)(struct pai_pmu *p); /* PMU support init function */ + void (*exit)(struct pai_pmu *p); /* PMU support exit function */ + struct attribute_group *event_group; /* Ptr to attribute of events */ +}; + +static struct pai_pmu pai_pmu[]; /* Forward declaration */ + +/* Free per CPU data when the last event is removed. */ +static void pai_root_free(int idx) +{ + if (refcount_dec_and_test(&pai_root[idx].refcnt)) { + free_percpu(pai_root[idx].mapptr); + pai_root[idx].mapptr = NULL; + } + debug_sprintf_event(paidbg, 5, "%s root[%d].refcount %d\n", __func__, + idx, refcount_read(&pai_root[idx].refcnt)); +} + +/* + * On initialization of first event also allocate per CPU data dynamically. + * Start with an array of pointers, the array size is the maximum number of + * CPUs possible, which might be larger than the number of CPUs currently + * online. + */ +static int pai_root_alloc(int idx) +{ + if (!refcount_inc_not_zero(&pai_root[idx].refcnt)) { + /* The memory is already zeroed. */ + pai_root[idx].mapptr = alloc_percpu(struct pai_mapptr); + if (!pai_root[idx].mapptr) + return -ENOMEM; + refcount_set(&pai_root[idx].refcnt, 1); + } + return 0; +} + +/* Release the PMU if event is the last perf event */ +static DEFINE_MUTEX(pai_reserve_mutex); + +/* Free all memory allocated for event counting/sampling setup */ +static void pai_free(struct pai_mapptr *mp) +{ + if (mp->mapptr->fullpage) + free_page((unsigned long)mp->mapptr->area); + else + kfree(mp->mapptr->area); + kfree(mp->mapptr->paiext_cb); + kvfree(mp->mapptr->save); + kfree(mp->mapptr); + mp->mapptr = NULL; +} + +/* Adjust usage counters and remove allocated memory when all users are + * gone. + */ +static void pai_event_destroy_cpu(struct perf_event *event, int cpu) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = per_cpu_ptr(pai_root[idx].mapptr, cpu); + struct pai_map *cpump = mp->mapptr; + + mutex_lock(&pai_reserve_mutex); + debug_sprintf_event(paidbg, 5, "%s event %#llx idx %d cpu %d users %d " + "refcnt %u\n", __func__, event->attr.config, idx, + event->cpu, cpump->active_events, + refcount_read(&cpump->refcnt)); + if (refcount_dec_and_test(&cpump->refcnt)) + pai_free(mp); + pai_root_free(idx); + mutex_unlock(&pai_reserve_mutex); +} + +static void pai_event_destroy(struct perf_event *event) +{ + int cpu; + + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + pai_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + pai_event_destroy_cpu(event, event->cpu); + } +} + +static void paicrypt_event_destroy(struct perf_event *event) +{ + static_branch_dec(&pai_key); + pai_event_destroy(event); +} + +static u64 pai_getctr(unsigned long *page, int nr, unsigned long offset) +{ + if (offset) + nr += offset / sizeof(*page); + return page[nr]; +} + +/* Read the counter values. Return value from location in CMP. For base + * event xxx_ALL sum up all events. Returns counter value. + */ +static u64 pai_getdata(struct perf_event *event, bool kernel) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_pmu *pp = &pai_pmu[idx]; + struct pai_map *cpump = mp->mapptr; + unsigned int i; + u64 sum = 0; + + if (event->attr.config != pp->base) { + return pai_getctr(cpump->area, + event->attr.config - pp->base, + kernel ? pp->kernel_offset : 0); + } + + for (i = 1; i <= pp->num_avail; i++) { + u64 val = pai_getctr(cpump->area, i, + kernel ? pp->kernel_offset : 0); + + if (!val) + continue; + sum += val; + } + return sum; +} + +static u64 paicrypt_getall(struct perf_event *event) +{ + u64 sum = 0; + + if (!event->attr.exclude_kernel) + sum += pai_getdata(event, true); + if (!event->attr.exclude_user) + sum += pai_getdata(event, false); + + return sum; +} + +/* Check concurrent access of counting and sampling for crypto events. + * This function is called in process context and it is save to block. + * When the event initialization functions fails, no other call back will + * be invoked. + * + * Allocate the memory for the event. + */ +static int pai_alloc_cpu(struct perf_event *event, int cpu) +{ + int rc, idx = PAI_PMU_IDX(event); + struct pai_map *cpump = NULL; + bool need_paiext_cb = false; + struct pai_mapptr *mp; + + mutex_lock(&pai_reserve_mutex); + /* Allocate root node */ + rc = pai_root_alloc(idx); + if (rc) + goto unlock; + + /* Allocate node for this event */ + mp = per_cpu_ptr(pai_root[idx].mapptr, cpu); + cpump = mp->mapptr; + if (!cpump) { /* Paicrypt_map allocated? */ + rc = -ENOMEM; + cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); + if (!cpump) + goto undo; + /* Allocate memory for counter page and counter extraction. + * Only the first counting event has to allocate a page. + */ + mp->mapptr = cpump; + if (idx == PAI_PMU_CRYPTO) { + cpump->area = (unsigned long *)get_zeroed_page(GFP_KERNEL); + /* free_page() can handle 0x0 address */ + cpump->fullpage = true; + } else { /* PAI_PMU_EXT */ + /* + * Allocate memory for counter area and counter extraction. + * These are + * - a 512 byte block and requires 512 byte boundary + * alignment. + * - a 1KB byte block and requires 1KB boundary + * alignment. + * Only the first counting event has to allocate the area. + * + * Note: This works with commit 59bb47985c1d by default. + * Backporting this to kernels without this commit might + * needs adjustment. + */ + cpump->area = kzalloc(pai_pmu[idx].area_size, GFP_KERNEL); + cpump->paiext_cb = kzalloc(PAIE1_CB_SZ, GFP_KERNEL); + need_paiext_cb = true; + } + cpump->save = kvmalloc_array(pai_pmu[idx].num_avail + 1, + sizeof(struct pai_userdata), + GFP_KERNEL); + if (!cpump->area || !cpump->save || + (need_paiext_cb && !cpump->paiext_cb)) { + pai_free(mp); + goto undo; + } + INIT_LIST_HEAD(&cpump->syswide_list); + refcount_set(&cpump->refcnt, 1); + rc = 0; + } else { + refcount_inc(&cpump->refcnt); + } + +undo: + if (rc) { + /* Error in allocation of event, decrement anchor. Since + * the event in not created, its destroy() function is never + * invoked. Adjust the reference counter for the anchor. + */ + pai_root_free(idx); + } +unlock: + mutex_unlock(&pai_reserve_mutex); + /* If rc is non-zero, no increment of counter/sampler was done. */ + return rc; +} + +static int pai_alloc(struct perf_event *event) +{ + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + rc = pai_alloc_cpu(event, cpu); + if (rc) { + for_each_cpu(cpu, maskptr) + pai_event_destroy_cpu(event, cpu); + kfree(maskptr); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in pai_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + +/* Validate event number and return error if event is not supported. + * On successful return, PAI_PMU_IDX(event) is set to the index of + * the supporting paing_support[] array element. + */ +static int pai_event_valid(struct perf_event *event, int idx) +{ + struct perf_event_attr *a = &event->attr; + struct pai_pmu *pp = &pai_pmu[idx]; + + /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ + if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) + return -ENOENT; + /* Allow only CRYPTO_ALL/NNPA_ALL for sampling */ + if (a->sample_period && a->config != pp->base) + return -EINVAL; + /* PAI crypto event must be in valid range, try others if not */ + if (a->config < pp->base || a->config > pp->base + pp->num_avail) + return -ENOENT; + if (idx == PAI_PMU_EXT && a->exclude_user) + return -EINVAL; + PAI_PMU_IDX(event) = idx; + return 0; +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int pai_event_init(struct perf_event *event, int idx) +{ + struct perf_event_attr *a = &event->attr; + int rc; + + /* PAI event must be valid and in supported range */ + rc = pai_event_valid(event, idx); + if (rc) + goto out; + /* Get a page to store last counter values for sampling */ + if (a->sample_period) { + PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL); + if (!PAI_SAVE_AREA(event)) { + rc = -ENOMEM; + goto out; + } + } + + if (event->cpu >= 0) + rc = pai_alloc_cpu(event, event->cpu); + else + rc = pai_alloc(event); + if (rc) { + free_page(PAI_SAVE_AREA(event)); + goto out; + } + + if (a->sample_period) { + a->sample_period = 1; + a->freq = 0; + /* Register for paicrypt_sched_task() to be called */ + event->attach_state |= PERF_ATTACH_SCHED_CB; + /* Add raw data which contain the memory mapped counters */ + a->sample_type |= PERF_SAMPLE_RAW; + /* Turn off inheritance */ + a->inherit = 0; + } +out: + return rc; +} + +static int paicrypt_event_init(struct perf_event *event) +{ + int rc = pai_event_init(event, PAI_PMU_CRYPTO); + + if (!rc) { + event->destroy = paicrypt_event_destroy; + static_branch_inc(&pai_key); + } + return rc; +} + +static void pai_read(struct perf_event *event, + u64 (*fct)(struct perf_event *event)) +{ + u64 prev, new, delta; + + prev = local64_read(&event->hw.prev_count); + new = fct(event); + local64_set(&event->hw.prev_count, new); + delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; + local64_add(delta, &event->count); +} + +static void paicrypt_read(struct perf_event *event) +{ + pai_read(event, paicrypt_getall); +} + +static void pai_start(struct perf_event *event, int flags, + u64 (*fct)(struct perf_event *event)) +{ + int idx = PAI_PMU_IDX(event); + struct pai_pmu *pp = &pai_pmu[idx]; + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + u64 sum; + + if (!event->attr.sample_period) { /* Counting */ + sum = fct(event); /* Get current value */ + local64_set(&event->hw.prev_count, sum); + } else { /* Sampling */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } + } +} + +static void paicrypt_start(struct perf_event *event, int flags) +{ + pai_start(event, flags, paicrypt_getall); +} + +static int pai_add(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + unsigned long ccd; + + if (++cpump->active_events == 1) { + if (!pcb) { /* PAI crypto */ + ccd = virt_to_phys(cpump->area) | PAI_CRYPTO_KERNEL_OFFSET; + WRITE_ONCE(get_lowcore()->ccd, ccd); + local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + } else { /* PAI extension 1 */ + ccd = virt_to_phys(pcb); + WRITE_ONCE(get_lowcore()->aicd, ccd); + pcb->acc = virt_to_phys(cpump->area) | 0x1; + /* Enable CPU instruction lookup for PAIE1 control block */ + local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); + } + } + if (flags & PERF_EF_START) + pai_pmu[idx].pmu->start(event, PERF_EF_RELOAD); + event->hw.state = 0; + return 0; +} + +static int paicrypt_add(struct perf_event *event, int flags) +{ + return pai_add(event, flags); +} + +static void pai_have_sample(struct perf_event *, struct pai_map *); +static void pai_stop(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + + if (!event->attr.sample_period) { /* Counting */ + pai_pmu[idx].pmu->read(event); + } else { /* Sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + perf_sched_cb_dec(event->pmu); + list_del(PAI_SWLIST(event)); + } else { + pai_have_sample(event, cpump); + cpump->event = NULL; + } + } + event->hw.state = PERF_HES_STOPPED; +} + +static void paicrypt_stop(struct perf_event *event, int flags) +{ + pai_stop(event, flags); +} + +static void pai_del(struct perf_event *event, int flags) +{ + int idx = PAI_PMU_IDX(event); + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct paiext_cb *pcb = cpump->paiext_cb; + + pai_pmu[idx].pmu->stop(event, PERF_EF_UPDATE); + if (--cpump->active_events == 0) { + if (!pcb) { /* PAI crypto */ + local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); + WRITE_ONCE(get_lowcore()->ccd, 0); + } else { /* PAI extension 1 */ + /* Disable CPU instruction lookup for PAIE1 control block */ + local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); + pcb->acc = 0; + WRITE_ONCE(get_lowcore()->aicd, 0); + } + } +} + +static void paicrypt_del(struct perf_event *event, int flags) +{ + pai_del(event, flags); +} + +/* Create raw data and save it in buffer. Calculate the delta for each + * counter between this invocation and the last invocation. + * Returns number of bytes copied. + * Saves only entries with positive counter difference of the form + * 2 bytes: Number of counter + * 8 bytes: Value of counter + */ +static size_t pai_copy(struct pai_userdata *userdata, unsigned long *page, + struct pai_pmu *pp, unsigned long *page_old, + bool exclude_user, bool exclude_kernel) +{ + int i, outidx = 0; + + for (i = 1; i <= pp->num_avail; i++) { + u64 val = 0, val_old = 0; + + if (!exclude_kernel) { + val += pai_getctr(page, i, pp->kernel_offset); + val_old += pai_getctr(page_old, i, pp->kernel_offset); + } + if (!exclude_user) { + val += pai_getctr(page, i, 0); + val_old += pai_getctr(page_old, i, 0); + } + if (val >= val_old) + val -= val_old; + else + val = (~0ULL - val_old) + val + 1; + if (val) { + userdata[outidx].num = i; + userdata[outidx].value = val; + outidx++; + } + } + return outidx * sizeof(*userdata); +} + +/* Write sample when one or more counters values are nonzero. + * + * Note: The function paicrypt_sched_task() and pai_push_sample() are not + * invoked after function paicrypt_del() has been called because of function + * perf_sched_cb_dec(). Both functions are only + * called when sampling is active. Function perf_sched_cb_inc() + * has been invoked to install function paicrypt_sched_task() as call back + * to run at context switch time. + * + * This causes function perf_event_context_sched_out() and + * perf_event_context_sched_in() to check whether the PMU has installed an + * sched_task() callback. That callback is not active after paicrypt_del() + * returns and has deleted the event on that CPU. + */ +static int pai_push_sample(size_t rawsize, struct pai_map *cpump, + struct perf_event *event) +{ + int idx = PAI_PMU_IDX(event); + struct pai_pmu *pp = &pai_pmu[idx]; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + memset(&data, 0, sizeof(data)); + perf_sample_data_init(&data, 0, event->hw.last_period); + if (event->attr.sample_type & PERF_SAMPLE_TID) { + data.tid_entry.pid = task_tgid_nr(current); + data.tid_entry.tid = task_pid_nr(current); + } + if (event->attr.sample_type & PERF_SAMPLE_TIME) + data.time = event->clock(); + if (event->attr.sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) + data.id = event->id; + if (event->attr.sample_type & PERF_SAMPLE_CPU) { + data.cpu_entry.cpu = smp_processor_id(); + data.cpu_entry.reserved = 0; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = rawsize; + raw.frag.data = cpump->save; + perf_sample_save_raw_data(&data, event, &raw); + } + + overflow = perf_event_overflow(event, &data, ®s); + perf_event_update_userpage(event); + /* Save crypto counter lowcore page after reading event data. */ + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, pp->area_size); + return overflow; +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void pai_have_sample(struct perf_event *event, struct pai_map *cpump) +{ + struct pai_pmu *pp; + size_t rawsize; + + if (!event) /* No event active */ + return; + pp = &pai_pmu[PAI_PMU_IDX(event)]; + rawsize = pai_copy(cpump->save, cpump->area, pp, + (unsigned long *)PAI_SAVE_AREA(event), + event->attr.exclude_user, + event->attr.exclude_kernel); + if (rawsize) /* No incremented counters */ + pai_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void pai_have_samples(int idx) +{ + struct pai_mapptr *mp = this_cpu_ptr(pai_root[idx].mapptr); + struct pai_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + pai_have_sample(event, cpump); +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event CRYPTO_ALL is allowed. + */ +static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, save old values. + */ + if (!sched_in) + pai_have_samples(PAI_PMU_CRYPTO); +} + +/* ============================= paiext ====================================*/ + +static void paiext_event_destroy(struct perf_event *event) +{ + pai_event_destroy(event); +} + +/* Might be called on different CPU than the one the event is intended for. */ +static int paiext_event_init(struct perf_event *event) +{ + int rc = pai_event_init(event, PAI_PMU_EXT); + + if (!rc) { + event->attr.exclude_kernel = true; /* No kernel space part */ + event->destroy = paiext_event_destroy; + /* Offset of NNPA in paiext_cb */ + event->hw.config_base = offsetof(struct paiext_cb, acc); + } + return rc; +} + +static u64 paiext_getall(struct perf_event *event) +{ + return pai_getdata(event, false); +} + +static void paiext_read(struct perf_event *event) +{ + pai_read(event, paiext_getall); +} + +static void paiext_start(struct perf_event *event, int flags) +{ + pai_start(event, flags, paiext_getall); +} + +static int paiext_add(struct perf_event *event, int flags) +{ + return pai_add(event, flags); +} + +static void paiext_stop(struct perf_event *event, int flags) +{ + pai_stop(event, flags); +} + +static void paiext_del(struct perf_event *event, int flags) +{ + pai_del(event, flags); +} + +/* Called on schedule-in and schedule-out. No access to event structure, + * but for sampling only event NNPA_ALL is allowed. + */ +static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, + struct task_struct *task, bool sched_in) +{ + /* We started with a clean page on event installation. So read out + * results on schedule_out and if page was dirty, save old values. + */ + if (!sched_in) + pai_have_samples(PAI_PMU_EXT); +} + +/* Attribute definitions for paicrypt interface. As with other CPU + * Measurement Facilities, there is one attribute per mapped counter. + * The number of mapped counters may vary per machine generation. Use + * the QUERY PROCESSOR ACTIVITY COUNTER INFORMATION (QPACI) instruction + * to determine the number of mapped counters. The instructions returns + * a positive number, which is the highest number of supported counters. + * All counters less than this number are also supported, there are no + * holes. A returned number of zero means no support for mapped counters. + * + * The identification of the counter is a unique number. The chosen range + * is 0x1000 + offset in mapped kernel page. + * All CPU Measurement Facility counters identifiers must be unique and + * the numbers from 0 to 496 are already used for the CPU Measurement + * Counter facility. Numbers 0xb0000, 0xbc000 and 0xbd000 are already + * used for the CPU Measurement Sampling facility. + */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *paicrypt_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paicrypt_events_group = { + .name = "events", + .attrs = NULL /* Filled in attr_event_init() */ +}; + +static struct attribute_group paicrypt_format_group = { + .name = "format", + .attrs = paicrypt_format_attr, +}; + +static const struct attribute_group *paicrypt_attr_groups[] = { + &paicrypt_events_group, + &paicrypt_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paicrypt = { + .task_ctx_nr = perf_hw_context, + .event_init = paicrypt_event_init, + .add = paicrypt_add, + .del = paicrypt_del, + .start = paicrypt_start, + .stop = paicrypt_stop, + .read = paicrypt_read, + .sched_task = paicrypt_sched_task, + .attr_groups = paicrypt_attr_groups +}; + +/* List of symbolic PAI counter names. */ +static const char * const paicrypt_ctrnames[] = { + [0] = "CRYPTO_ALL", + [1] = "KM_DEA", + [2] = "KM_TDEA_128", + [3] = "KM_TDEA_192", + [4] = "KM_ENCRYPTED_DEA", + [5] = "KM_ENCRYPTED_TDEA_128", + [6] = "KM_ENCRYPTED_TDEA_192", + [7] = "KM_AES_128", + [8] = "KM_AES_192", + [9] = "KM_AES_256", + [10] = "KM_ENCRYPTED_AES_128", + [11] = "KM_ENCRYPTED_AES_192", + [12] = "KM_ENCRYPTED_AES_256", + [13] = "KM_XTS_AES_128", + [14] = "KM_XTS_AES_256", + [15] = "KM_XTS_ENCRYPTED_AES_128", + [16] = "KM_XTS_ENCRYPTED_AES_256", + [17] = "KMC_DEA", + [18] = "KMC_TDEA_128", + [19] = "KMC_TDEA_192", + [20] = "KMC_ENCRYPTED_DEA", + [21] = "KMC_ENCRYPTED_TDEA_128", + [22] = "KMC_ENCRYPTED_TDEA_192", + [23] = "KMC_AES_128", + [24] = "KMC_AES_192", + [25] = "KMC_AES_256", + [26] = "KMC_ENCRYPTED_AES_128", + [27] = "KMC_ENCRYPTED_AES_192", + [28] = "KMC_ENCRYPTED_AES_256", + [29] = "KMC_PRNG", + [30] = "KMA_GCM_AES_128", + [31] = "KMA_GCM_AES_192", + [32] = "KMA_GCM_AES_256", + [33] = "KMA_GCM_ENCRYPTED_AES_128", + [34] = "KMA_GCM_ENCRYPTED_AES_192", + [35] = "KMA_GCM_ENCRYPTED_AES_256", + [36] = "KMF_DEA", + [37] = "KMF_TDEA_128", + [38] = "KMF_TDEA_192", + [39] = "KMF_ENCRYPTED_DEA", + [40] = "KMF_ENCRYPTED_TDEA_128", + [41] = "KMF_ENCRYPTED_TDEA_192", + [42] = "KMF_AES_128", + [43] = "KMF_AES_192", + [44] = "KMF_AES_256", + [45] = "KMF_ENCRYPTED_AES_128", + [46] = "KMF_ENCRYPTED_AES_192", + [47] = "KMF_ENCRYPTED_AES_256", + [48] = "KMCTR_DEA", + [49] = "KMCTR_TDEA_128", + [50] = "KMCTR_TDEA_192", + [51] = "KMCTR_ENCRYPTED_DEA", + [52] = "KMCTR_ENCRYPTED_TDEA_128", + [53] = "KMCTR_ENCRYPTED_TDEA_192", + [54] = "KMCTR_AES_128", + [55] = "KMCTR_AES_192", + [56] = "KMCTR_AES_256", + [57] = "KMCTR_ENCRYPTED_AES_128", + [58] = "KMCTR_ENCRYPTED_AES_192", + [59] = "KMCTR_ENCRYPTED_AES_256", + [60] = "KMO_DEA", + [61] = "KMO_TDEA_128", + [62] = "KMO_TDEA_192", + [63] = "KMO_ENCRYPTED_DEA", + [64] = "KMO_ENCRYPTED_TDEA_128", + [65] = "KMO_ENCRYPTED_TDEA_192", + [66] = "KMO_AES_128", + [67] = "KMO_AES_192", + [68] = "KMO_AES_256", + [69] = "KMO_ENCRYPTED_AES_128", + [70] = "KMO_ENCRYPTED_AES_192", + [71] = "KMO_ENCRYPTED_AES_256", + [72] = "KIMD_SHA_1", + [73] = "KIMD_SHA_256", + [74] = "KIMD_SHA_512", + [75] = "KIMD_SHA3_224", + [76] = "KIMD_SHA3_256", + [77] = "KIMD_SHA3_384", + [78] = "KIMD_SHA3_512", + [79] = "KIMD_SHAKE_128", + [80] = "KIMD_SHAKE_256", + [81] = "KIMD_GHASH", + [82] = "KLMD_SHA_1", + [83] = "KLMD_SHA_256", + [84] = "KLMD_SHA_512", + [85] = "KLMD_SHA3_224", + [86] = "KLMD_SHA3_256", + [87] = "KLMD_SHA3_384", + [88] = "KLMD_SHA3_512", + [89] = "KLMD_SHAKE_128", + [90] = "KLMD_SHAKE_256", + [91] = "KMAC_DEA", + [92] = "KMAC_TDEA_128", + [93] = "KMAC_TDEA_192", + [94] = "KMAC_ENCRYPTED_DEA", + [95] = "KMAC_ENCRYPTED_TDEA_128", + [96] = "KMAC_ENCRYPTED_TDEA_192", + [97] = "KMAC_AES_128", + [98] = "KMAC_AES_192", + [99] = "KMAC_AES_256", + [100] = "KMAC_ENCRYPTED_AES_128", + [101] = "KMAC_ENCRYPTED_AES_192", + [102] = "KMAC_ENCRYPTED_AES_256", + [103] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_DEA", + [104] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_128", + [105] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_TDEA_192", + [106] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_DEA", + [107] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_128", + [108] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_TDEA_192", + [109] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_128", + [110] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_192", + [111] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_AES_256", + [112] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_128", + [113] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_192", + [114] = "PCC_COMPUTE_LAST_BLOCK_CMAC_USING_ENCRYPTED_AES_256", + [115] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_128", + [116] = "PCC_COMPUTE_XTS_PARAMETER_USING_AES_256", + [117] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_128", + [118] = "PCC_COMPUTE_XTS_PARAMETER_USING_ENCRYPTED_AES_256", + [119] = "PCC_SCALAR_MULTIPLY_P256", + [120] = "PCC_SCALAR_MULTIPLY_P384", + [121] = "PCC_SCALAR_MULTIPLY_P521", + [122] = "PCC_SCALAR_MULTIPLY_ED25519", + [123] = "PCC_SCALAR_MULTIPLY_ED448", + [124] = "PCC_SCALAR_MULTIPLY_X25519", + [125] = "PCC_SCALAR_MULTIPLY_X448", + [126] = "PRNO_SHA_512_DRNG", + [127] = "PRNO_TRNG_QUERY_RAW_TO_CONDITIONED_RATIO", + [128] = "PRNO_TRNG", + [129] = "KDSA_ECDSA_VERIFY_P256", + [130] = "KDSA_ECDSA_VERIFY_P384", + [131] = "KDSA_ECDSA_VERIFY_P521", + [132] = "KDSA_ECDSA_SIGN_P256", + [133] = "KDSA_ECDSA_SIGN_P384", + [134] = "KDSA_ECDSA_SIGN_P521", + [135] = "KDSA_ENCRYPTED_ECDSA_SIGN_P256", + [136] = "KDSA_ENCRYPTED_ECDSA_SIGN_P384", + [137] = "KDSA_ENCRYPTED_ECDSA_SIGN_P521", + [138] = "KDSA_EDDSA_VERIFY_ED25519", + [139] = "KDSA_EDDSA_VERIFY_ED448", + [140] = "KDSA_EDDSA_SIGN_ED25519", + [141] = "KDSA_EDDSA_SIGN_ED448", + [142] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED25519", + [143] = "KDSA_ENCRYPTED_EDDSA_SIGN_ED448", + [144] = "PCKMO_ENCRYPT_DEA_KEY", + [145] = "PCKMO_ENCRYPT_TDEA_128_KEY", + [146] = "PCKMO_ENCRYPT_TDEA_192_KEY", + [147] = "PCKMO_ENCRYPT_AES_128_KEY", + [148] = "PCKMO_ENCRYPT_AES_192_KEY", + [149] = "PCKMO_ENCRYPT_AES_256_KEY", + [150] = "PCKMO_ENCRYPT_ECC_P256_KEY", + [151] = "PCKMO_ENCRYPT_ECC_P384_KEY", + [152] = "PCKMO_ENCRYPT_ECC_P521_KEY", + [153] = "PCKMO_ENCRYPT_ECC_ED25519_KEY", + [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", + [155] = "IBM_RESERVED_155", + [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", +}; + +static struct attribute *paiext_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group paiext_events_group = { + .name = "events", + .attrs = NULL, /* Filled in attr_event_init() */ +}; + +static struct attribute_group paiext_format_group = { + .name = "format", + .attrs = paiext_format_attr, +}; + +static const struct attribute_group *paiext_attr_groups[] = { + &paiext_events_group, + &paiext_format_group, + NULL, +}; + +/* Performance monitoring unit for mapped counters */ +static struct pmu paiext = { + .task_ctx_nr = perf_hw_context, + .event_init = paiext_event_init, + .add = paiext_add, + .del = paiext_del, + .start = paiext_start, + .stop = paiext_stop, + .read = paiext_read, + .sched_task = paiext_sched_task, + .attr_groups = paiext_attr_groups, +}; + +/* List of symbolic PAI extension 1 NNPA counter names. */ +static const char * const paiext_ctrnames[] = { + [0] = "NNPA_ALL", + [1] = "NNPA_ADD", + [2] = "NNPA_SUB", + [3] = "NNPA_MUL", + [4] = "NNPA_DIV", + [5] = "NNPA_MIN", + [6] = "NNPA_MAX", + [7] = "NNPA_LOG", + [8] = "NNPA_EXP", + [9] = "NNPA_IBM_RESERVED_9", + [10] = "NNPA_RELU", + [11] = "NNPA_TANH", + [12] = "NNPA_SIGMOID", + [13] = "NNPA_SOFTMAX", + [14] = "NNPA_BATCHNORM", + [15] = "NNPA_MAXPOOL2D", + [16] = "NNPA_AVGPOOL2D", + [17] = "NNPA_LSTMACT", + [18] = "NNPA_GRUACT", + [19] = "NNPA_CONVOLUTION", + [20] = "NNPA_MATMUL_OP", + [21] = "NNPA_MATMUL_OP_BCAST23", + [22] = "NNPA_SMALLBATCH", + [23] = "NNPA_LARGEDIM", + [24] = "NNPA_SMALLTENSOR", + [25] = "NNPA_1MFRAME", + [26] = "NNPA_2GFRAME", + [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", +}; + +static void __init attr_event_free(struct attribute **attrs) +{ + struct perf_pmu_events_attr *pa; + unsigned int i; + + for (i = 0; attrs[i]; i++) { + struct device_attribute *dap; + + dap = container_of(attrs[i], struct device_attribute, attr); + pa = container_of(dap, struct perf_pmu_events_attr, attr); + kfree(pa); + } + kfree(attrs); +} + +static struct attribute * __init attr_event_init_one(int num, + unsigned long base, + const char *name) +{ + struct perf_pmu_events_attr *pa; + + pa = kzalloc(sizeof(*pa), GFP_KERNEL); + if (!pa) + return NULL; + + sysfs_attr_init(&pa->attr.attr); + pa->id = base + num; + pa->attr.attr.name = name; + pa->attr.attr.mode = 0444; + pa->attr.show = cpumf_events_sysfs_show; + pa->attr.store = NULL; + return &pa->attr.attr; +} + +static struct attribute ** __init attr_event_init(struct pai_pmu *p) +{ + unsigned int min_attr = min_t(unsigned int, p->num_named, p->num_avail); + struct attribute **attrs; + unsigned int i; + + attrs = kmalloc_array(min_attr + 1, sizeof(*attrs), GFP_KERNEL | __GFP_ZERO); + if (!attrs) + goto out; + for (i = 0; i < min_attr; i++) { + attrs[i] = attr_event_init_one(i, p->base, p->names[i]); + if (!attrs[i]) { + attr_event_free(attrs); + attrs = NULL; + goto out; + } + } + attrs[i] = NULL; +out: + return attrs; +} + +static void __init pai_pmu_exit(struct pai_pmu *p) +{ + attr_event_free(p->event_group->attrs); + p->event_group->attrs = NULL; +} + +/* Add a PMU. Install its events and register the PMU device driver + * call back functions. + */ +static int __init pai_pmu_init(struct pai_pmu *p) +{ + int rc = -ENOMEM; + + + /* Export known PAI events */ + p->event_group->attrs = attr_event_init(p); + if (!p->event_group->attrs) { + pr_err("Creation of PMU %s /sysfs failed\n", p->pmuname); + goto out; + } + + rc = perf_pmu_register(p->pmu, p->pmuname, -1); + if (rc) { + pai_pmu_exit(p); + pr_err("Registering PMU %s failed with rc=%i\n", p->pmuname, + rc); + } +out: + return rc; +} + +/* PAI PMU characteristics table */ +static struct pai_pmu pai_pmu[] __refdata = { + [PAI_PMU_CRYPTO] = { + .pmuname = "pai_crypto", + .facility_nr = 196, + .num_named = ARRAY_SIZE(paicrypt_ctrnames), + .names = paicrypt_ctrnames, + .base = PAI_CRYPTO_BASE, + .kernel_offset = PAI_CRYPTO_KERNEL_OFFSET, + .area_size = PAGE_SIZE, + .init = pai_pmu_init, + .exit = pai_pmu_exit, + .pmu = &paicrypt, + .event_group = &paicrypt_events_group + }, + [PAI_PMU_EXT] = { + .pmuname = "pai_ext", + .facility_nr = 197, + .num_named = ARRAY_SIZE(paiext_ctrnames), + .names = paiext_ctrnames, + .base = PAI_NNPA_BASE, + .kernel_offset = 0, + .area_size = PAIE1_CTRBLOCK_SZ, + .init = pai_pmu_init, + .exit = pai_pmu_exit, + .pmu = &paiext, + .event_group = &paiext_events_group + } +}; + +/* + * Check if the PMU (via facility) is supported by machine. Try all of the + * supported PAI PMUs. + * Return number of successfully installed PMUs. + */ +static int __init paipmu_setup(void) +{ + struct qpaci_info_block ib; + int install_ok = 0, rc; + struct pai_pmu *p; + size_t i; + + for (i = 0; i < ARRAY_SIZE(pai_pmu); ++i) { + p = &pai_pmu[i]; + + if (!test_facility(p->facility_nr)) + continue; + + qpaci(&ib); + switch (i) { + case PAI_PMU_CRYPTO: + p->num_avail = ib.num_cc; + if (p->num_avail >= PAI_CRYPTO_MAXCTR) { + pr_err("Too many PMU %s counters %d\n", + p->pmuname, p->num_avail); + continue; + } + break; + case PAI_PMU_EXT: + p->num_avail = ib.num_nnpa; + break; + } + p->num_avail += 1; /* Add xxx_ALL event */ + if (p->init) { + rc = p->init(p); + if (!rc) + ++install_ok; + } + } + return install_ok; +} + +static int __init pai_init(void) +{ + /* Setup s390dbf facility */ + paidbg = debug_register("pai", 32, 256, 128); + if (!paidbg) { + pr_err("Registration of s390dbf pai failed\n"); + return -ENOMEM; + } + debug_register_view(paidbg, &debug_sprintf_view); + + if (!paipmu_setup()) { + /* No PMU registration, no need for debug buffer */ + debug_unregister_view(paidbg, &debug_sprintf_view); + debug_unregister(paidbg); + return -ENODEV; + } + return 0; +} + +device_initcall(pai_init); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 6e9e5d5e927e..7b305f1456f8 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -5,8 +5,7 @@ #include <linux/errno.h> #include <linux/bug.h> #include <asm/ptrace.h> -#include <asm/fpu/api.h> -#include <asm/fpu/types.h> +#include <asm/fpu.h> u64 perf_reg_value(struct pt_regs *regs, int idx) { @@ -20,8 +19,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) return 0; idx -= PERF_REG_S390_FP0; - fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) - : current->thread.fpu.fprs[idx]; + fp = *(freg_t *)(current->thread.ufpu.vxrs + idx); return fp.ui; } @@ -46,9 +44,6 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_31BIT)) - return PERF_SAMPLE_REGS_ABI_32; - return PERF_SAMPLE_REGS_ABI_64; } @@ -63,6 +58,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, */ regs_user->regs = task_pt_regs(current); if (user_mode(regs_user->regs)) - save_fpu_regs(); + save_user_fpu_regs(); regs_user->abi = perf_reg_abi(current); } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 350e94d0cac2..0df95dcb2101 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -24,22 +24,24 @@ #include <linux/tick.h> #include <linux/personality.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <linux/kprobes.h> #include <linux/random.h> -#include <linux/export.h> #include <linux/init_task.h> #include <linux/entry-common.h> +#include <linux/io.h> +#include <asm/guarded_storage.h> +#include <asm/access-regs.h> +#include <asm/switch_to.h> #include <asm/cpu_mf.h> -#include <asm/io.h> #include <asm/processor.h> +#include <asm/ptrace.h> #include <asm/vtimer.h> #include <asm/exec.h> +#include <asm/fpu.h> #include <asm/irq.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/stacktrace.h> -#include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/unwind.h> #include "entry.h" @@ -67,10 +69,10 @@ void flush_thread(void) void arch_setup_new_exec(void) { - if (S390_lowcore.current_pid != current->pid) { - S390_lowcore.current_pid = current->pid; + if (get_lowcore()->current_pid != current->pid) { + get_lowcore()->current_pid = current->pid; if (test_facility(40)) - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); } } @@ -82,21 +84,30 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { + save_user_fpu_regs(); + + *dst = *src; + dst->thread.kfpu_flags = 0; + /* - * Save the floating-point or vector register state of the current - * task and set the CIF_FPU flag to lazy restore the FPU register - * state when returning to user space. + * Don't transfer over the runtime instrumentation or the guarded + * storage control block pointers. These fields are cleared here instead + * of in copy_thread() to avoid premature freeing of associated memory + * on fork() failure. Wait to clear the RI flag because ->stack still + * refers to the source thread. */ - save_fpu_regs(); + dst->thread.ri_cb = NULL; + dst->thread.gs_cb = NULL; + dst->thread.gs_bc_cb = NULL; - memcpy(dst, src, arch_task_struct_size); - dst->thread.fpu.regs = dst->thread.fpu.fprs; return 0; } -int copy_thread(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + u64 clone_flags = args->flags; + unsigned long new_stackp = args->stack; + unsigned long tls = args->tls; struct fake_frame { struct stack_frame sf; @@ -122,26 +133,23 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.last_break = 1; frame->sf.back_chain = 0; - frame->sf.gprs[5] = (unsigned long)frame + sizeof(struct stack_frame); - frame->sf.gprs[6] = (unsigned long)p; + frame->sf.gprs[11 - 6] = (unsigned long)&frame->childregs; + frame->sf.gprs[12 - 6] = (unsigned long)p; /* new return point is ret_from_fork */ - frame->sf.gprs[8] = (unsigned long)ret_from_fork; + frame->sf.gprs[14 - 6] = (unsigned long)ret_from_fork; /* fake return stack for resume(), don't go back to schedule */ - frame->sf.gprs[9] = (unsigned long)frame; + frame->sf.gprs[15 - 6] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); - frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | - PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - frame->childregs.psw.addr = - (unsigned long)__ret_from_fork; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; - frame->childregs.gprs[11] = (unsigned long)do_exit; + frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | + PSW_MASK_EXT | PSW_MASK_MCHECK; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; - + frame->childregs.last_break = 1; return 0; } frame->childregs = *current_pt_regs(); @@ -149,22 +157,16 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.flags = 0; if (new_stackp) frame->childregs.gprs[15] = new_stackp; - - /* Don't copy runtime instrumentation info */ - p->thread.ri_cb = NULL; + /* + * Clear the runtime instrumentation flag after the above childregs + * copy. The CB pointer was already cleared in arch_dup_task_struct(). + */ frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Don't copy guarded storage control block */ - p->thread.gs_cb = NULL; - p->thread.gs_bc_cb = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { - if (is_compat_task()) { - p->thread.acrs[0] = (unsigned int)tls; - } else { - p->thread.acrs[0] = (unsigned int)(tls >> 32); - p->thread.acrs[1] = (unsigned int)tls; - } + p->thread.acrs[0] = (unsigned int)(tls >> 32); + p->thread.acrs[1] = (unsigned int)tls; } /* * s390 stores the svc return address in arch_data when calling @@ -177,16 +179,31 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, void execve_tail(void) { - current->thread.fpu.fpc = 0; - asm volatile("sfpc %0" : : "d" (0)); + current->thread.ufpu.fpc = 0; + fpu_sfpc(0); +} + +struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next) +{ + save_user_fpu_regs(); + save_kernel_fpu_regs(&prev->thread); + save_access_regs(&prev->thread.acrs[0]); + save_ri_cb(prev->thread.ri_cb); + save_gs_cb(prev->thread.gs_cb); + update_cr_regs(next); + restore_kernel_fpu_regs(&next->thread); + restore_access_regs(&next->thread.acrs[0]); + restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); + restore_gs_cb(next->thread.gs_cb); + return __switch_to_asm(prev, next); } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || task_is_running(p) || !task_stack_page(p)) + if (!task_stack_page(p)) return 0; if (!try_get_task_stack(p)) @@ -213,13 +230,13 @@ unsigned long get_wchan(struct task_struct *p) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() & ~PAGE_MASK; + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } static inline unsigned long brk_rnd(void) { - return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT; + return (get_random_u16() & BRK_RND_MASK) << PAGE_SHIFT; } unsigned long arch_randomize_brk(struct mm_struct *mm) diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 82df39b17bb5..e33a3eccda56 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -4,27 +4,33 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt #include <linux/stop_machine.h> #include <linux/cpufeature.h> #include <linux/bitops.h> #include <linux/kernel.h> +#include <linux/random.h> #include <linux/sched/mm.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/mm_types.h> #include <linux/delay.h> #include <linux/cpu.h> - +#include <linux/smp.h> +#include <asm/text-patching.h> +#include <asm/machine.h> #include <asm/diag.h> #include <asm/facility.h> #include <asm/elf.h> #include <asm/lowcore.h> #include <asm/param.h> +#include <asm/sclp.h> #include <asm/smp.h> +unsigned long __read_mostly elf_hwcap; +char elf_platform[ELF_PLATFORM_SIZE]; + struct cpu_info { unsigned int cpu_mhz_dynamic; unsigned int cpu_mhz_static; @@ -67,7 +73,7 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) this_cpu = smp_processor_id(); if (__this_cpu_inc_return(cpu_relax_retry) >= spin_retry) { __this_cpu_write(cpu_relax_retry, 0); - cpu = cpumask_next_wrap(this_cpu, cpumask, this_cpu, false); + cpu = cpumask_next_wrap(this_cpu, cpumask); if (cpu >= nr_cpu_ids) return; if (arch_vcpu_is_preempted(cpu)) @@ -75,6 +81,23 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) } } +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} + /* * cpu_init - initializes state that is per-CPU. */ @@ -91,15 +114,6 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, current); } -/* - * cpu_have_feature - Test CPU features on module initialization - */ -int cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} -EXPORT_SYMBOL(cpu_have_feature); - static void show_facilities(struct seq_file *m) { unsigned int bit; @@ -113,15 +127,33 @@ static void show_facilities(struct seq_file *m) static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", - "vxe2", "vxp", "sort", "dflt" - }; - static const char * const int_hwcap_str[] = { - "sie" + [HWCAP_NR_ESAN3] = "esan3", + [HWCAP_NR_ZARCH] = "zarch", + [HWCAP_NR_STFLE] = "stfle", + [HWCAP_NR_MSA] = "msa", + [HWCAP_NR_LDISP] = "ldisp", + [HWCAP_NR_EIMM] = "eimm", + [HWCAP_NR_DFP] = "dfp", + [HWCAP_NR_HPAGE] = "edat", + [HWCAP_NR_ETF3EH] = "etf3eh", + [HWCAP_NR_HIGH_GPRS] = "highgprs", + [HWCAP_NR_TE] = "te", + [HWCAP_NR_VXRS] = "vx", + [HWCAP_NR_VXRS_BCD] = "vxd", + [HWCAP_NR_VXRS_EXT] = "vxe", + [HWCAP_NR_GS] = "gs", + [HWCAP_NR_VXRS_EXT2] = "vxe2", + [HWCAP_NR_VXRS_PDE] = "vxp", + [HWCAP_NR_SORT] = "sort", + [HWCAP_NR_DFLT] = "dflt", + [HWCAP_NR_VXRS_PDE2] = "vxp2", + [HWCAP_NR_NNPA] = "nnpa", + [HWCAP_NR_PCI_MIO] = "pcimio", + [HWCAP_NR_SIE] = "sie", }; int i, cpu; + BUILD_BUG_ON(ARRAY_SIZE(hwcap_str) != HWCAP_NR_MAX); seq_printf(m, "vendor_id : IBM/S390\n" "# processors : %i\n" "bogomips per cpu: %lu.%02lu\n", @@ -132,9 +164,6 @@ static void show_cpu_summary(struct seq_file *m, void *v) for (i = 0; i < ARRAY_SIZE(hwcap_str); i++) if (hwcap_str[i] && (elf_hwcap & (1UL << i))) seq_printf(m, "%s ", hwcap_str[i]); - for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++) - if (int_hwcap_str[i] && (int_hwcap & (1UL << i))) - seq_printf(m, "%s ", int_hwcap_str[i]); seq_puts(m, "\n"); show_facilities(m); show_cacheinfo(m); @@ -149,6 +178,130 @@ static void show_cpu_summary(struct seq_file *m, void *v) } } +static int __init setup_hwcaps(void) +{ + /* instructions named N3, "backported" to esa-mode */ + elf_hwcap |= HWCAP_ESAN3; + + /* z/Architecture mode active */ + elf_hwcap |= HWCAP_ZARCH; + + /* store-facility-list-extended */ + if (test_facility(7)) + elf_hwcap |= HWCAP_STFLE; + + /* message-security assist */ + if (test_facility(17)) + elf_hwcap |= HWCAP_MSA; + + /* long-displacement */ + if (test_facility(19)) + elf_hwcap |= HWCAP_LDISP; + + /* extended-immediate */ + elf_hwcap |= HWCAP_EIMM; + + /* extended-translation facility 3 enhancement */ + if (test_facility(22) && test_facility(30)) + elf_hwcap |= HWCAP_ETF3EH; + + /* decimal floating point & perform floating point operation */ + if (test_facility(42) && test_facility(44)) + elf_hwcap |= HWCAP_DFP; + + /* huge page support */ + if (cpu_has_edat1()) + elf_hwcap |= HWCAP_HPAGE; + + /* 64-bit register support for 31-bit processes */ + elf_hwcap |= HWCAP_HIGH_GPRS; + + /* transactional execution */ + if (machine_has_tx()) + elf_hwcap |= HWCAP_TE; + + /* vector */ + if (test_facility(129)) { + elf_hwcap |= HWCAP_VXRS; + if (test_facility(134)) + elf_hwcap |= HWCAP_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_VXRS_EXT; + if (test_facility(148)) + elf_hwcap |= HWCAP_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_VXRS_PDE; + if (test_facility(192)) + elf_hwcap |= HWCAP_VXRS_PDE2; + } + + if (test_facility(150)) + elf_hwcap |= HWCAP_SORT; + + if (test_facility(151)) + elf_hwcap |= HWCAP_DFLT; + + if (test_facility(165)) + elf_hwcap |= HWCAP_NNPA; + + /* guarded storage */ + if (cpu_has_gs()) + elf_hwcap |= HWCAP_GS; + + if (test_machine_feature(MFEATURE_PCI_MIO)) + elf_hwcap |= HWCAP_PCI_MIO; + + /* virtualization support */ + if (sclp.has_sief2) + elf_hwcap |= HWCAP_SIE; + + return 0; +} +arch_initcall(setup_hwcaps); + +static int __init setup_elf_platform(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + add_device_randomness(&cpu_id, sizeof(cpu_id)); + switch (cpu_id.machine) { + default: /* Use "z10" as default. */ + strscpy(elf_platform, "z10"); + break; + case 0x2817: + case 0x2818: + strscpy(elf_platform, "z196"); + break; + case 0x2827: + case 0x2828: + strscpy(elf_platform, "zEC12"); + break; + case 0x2964: + case 0x2965: + strscpy(elf_platform, "z13"); + break; + case 0x3906: + case 0x3907: + strscpy(elf_platform, "z14"); + break; + case 0x8561: + case 0x8562: + strscpy(elf_platform, "z15"); + break; + case 0x3931: + case 0x3932: + strscpy(elf_platform, "z16"); + break; + case 0x9175: + case 0x9176: + strscpy(elf_platform, "z17"); + break; + } + return 0; +} +arch_initcall(setup_elf_platform); + static void show_cpu_topology(struct seq_file *m, unsigned long n) { #ifdef CONFIG_SCHED_TOPOLOGY @@ -210,7 +363,7 @@ static inline void *c_update(loff_t *pos) static void *c_start(struct seq_file *m, loff_t *pos) { - get_online_cpus(); + cpus_read_lock(); return c_update(pos); } @@ -222,7 +375,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { - put_online_cpus(); + cpus_read_unlock(); } const struct seq_operations cpuinfo_op = { @@ -231,21 +384,3 @@ const struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -int s390_isolate_bp(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp); - -int s390_isolate_bp_guest(void) -{ - if (!test_facility(82)) - return -EOPNOTSUPP; - set_thread_flag(TIF_ISOLATE_BP_GUEST); - return 0; -} -EXPORT_SYMBOL(s390_isolate_bp_guest); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 0ea3d02b378d..ceaa1726e328 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,10 +7,10 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include "asm/ptrace.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/cpufeature.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> @@ -21,38 +21,44 @@ #include <linux/signal.h> #include <linux/elf.h> #include <linux/regset.h> -#include <linux/tracehook.h> #include <linux/seccomp.h> -#include <linux/compat.h> #include <trace/syscall.h> +#include <asm/guarded_storage.h> +#include <asm/access-regs.h> #include <asm/page.h> #include <linux/uaccess.h> #include <asm/unistd.h> -#include <asm/switch_to.h> #include <asm/runtime_instr.h> #include <asm/facility.h> +#include <asm/machine.h> +#include <asm/ptrace.h> +#include <asm/rwonce.h> +#include <asm/fpu.h> #include "entry.h" -#ifdef CONFIG_COMPAT -#include "compat_ptrace.h" -#endif - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); struct thread_struct *thread = &task->thread; - struct per_regs old, new; union ctlreg0 cr0_old, cr0_new; union ctlreg2 cr2_old, cr2_new; int cr0_changed, cr2_changed; - - __ctl_store(cr0_old.val, 0, 0); - __ctl_store(cr2_old.val, 2, 2); + union { + struct ctlreg regs[3]; + struct { + struct ctlreg control; + struct ctlreg start; + struct ctlreg end; + }; + } old, new; + + local_ctl_store(0, &cr0_old.reg); + local_ctl_store(2, &cr2_old.reg); cr0_new = cr0_old; cr2_new = cr2_old; /* Take care of the enable/disable of transactional execution. */ - if (MACHINE_HAS_TE) { + if (machine_has_tx()) { /* Set or clear transaction execution TXC bit 8. */ cr0_new.tcx = 1; if (task->thread.per_flags & PER_FLAG_NO_TE) @@ -67,7 +73,7 @@ void update_cr_regs(struct task_struct *task) } } /* Take care of enable/disable of guarded storage. */ - if (MACHINE_HAS_GS) { + if (cpu_has_gs()) { cr2_new.gse = 0; if (task->thread.gs_cb) cr2_new.gse = 1; @@ -76,38 +82,38 @@ void update_cr_regs(struct task_struct *task) cr0_changed = cr0_new.val != cr0_old.val; cr2_changed = cr2_new.val != cr2_old.val; if (cr0_changed) - __ctl_load(cr0_new.val, 0, 0); + local_ctl_load(0, &cr0_new.reg); if (cr2_changed) - __ctl_load(cr2_new.val, 2, 2); + local_ctl_load(2, &cr2_new.reg); /* Copy user specified PER registers */ - new.control = thread->per_user.control; - new.start = thread->per_user.start; - new.end = thread->per_user.end; + new.control.val = thread->per_user.control; + new.start.val = thread->per_user.start; + new.end.val = thread->per_user.end; /* merge TIF_SINGLE_STEP into user specified PER registers. */ if (test_tsk_thread_flag(task, TIF_SINGLE_STEP) || test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) { if (test_tsk_thread_flag(task, TIF_BLOCK_STEP)) - new.control |= PER_EVENT_BRANCH; + new.control.val |= PER_EVENT_BRANCH; else - new.control |= PER_EVENT_IFETCH; - new.control |= PER_CONTROL_SUSPENSION; - new.control |= PER_EVENT_TRANSACTION_END; + new.control.val |= PER_EVENT_IFETCH; + new.control.val |= PER_CONTROL_SUSPENSION; + new.control.val |= PER_EVENT_TRANSACTION_END; if (test_tsk_thread_flag(task, TIF_UPROBE_SINGLESTEP)) - new.control |= PER_EVENT_IFETCH; - new.start = 0; - new.end = -1UL; + new.control.val |= PER_EVENT_IFETCH; + new.start.val = 0; + new.end.val = -1UL; } /* Take care of the PER enablement bit in the PSW. */ - if (!(new.control & PER_EVENT_MASK)) { + if (!(new.control.val & PER_EVENT_MASK)) { regs->psw.mask &= ~PSW_MASK_PER; return; } regs->psw.mask |= PSW_MASK_PER; - __ctl_store(old, 9, 11); + __local_ctl_store(9, 11, old.regs); if (memcmp(&new, &old, sizeof(struct per_regs)) != 0) - __ctl_load(new, 9, 11); + __local_ctl_load(9, 11, new.regs); } void user_enable_single_step(struct task_struct *task) @@ -147,38 +153,36 @@ void ptrace_disable(struct task_struct *task) static inline unsigned long __peek_user_per(struct task_struct *child, addr_t addr) { - struct per_struct_kernel *dummy = NULL; - - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* Control bits of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy->cr10) + else if (addr == offsetof(struct per_struct_kernel, cr10)) /* Start address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy->cr11) + else if (addr == offsetof(struct per_struct_kernel, cr11)) /* End address of the active per set. */ return test_thread_flag(TIF_SINGLE_STEP) ? -1UL : child->thread.per_user.end; - else if (addr == (addr_t) &dummy->bits) + else if (addr == offsetof(struct per_struct_kernel, bits)) /* Single-step bit. */ return test_thread_flag(TIF_SINGLE_STEP) ? (1UL << (BITS_PER_LONG - 1)) : 0; - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Start address of the user specified per set. */ return child->thread.per_user.start; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* End address of the user specified per set. */ return child->thread.per_user.end; - else if (addr == (addr_t) &dummy->perc_atmid) + else if (addr == offsetof(struct per_struct_kernel, perc_atmid)) /* PER code, ATMID and AI of the last PER trap */ return (unsigned long) child->thread.per_event.cause << (BITS_PER_LONG - 16); - else if (addr == (addr_t) &dummy->address) + else if (addr == offsetof(struct per_struct_kernel, address)) /* Address of the last PER trap */ return child->thread.per_event.address; - else if (addr == (addr_t) &dummy->access_id) + else if (addr == offsetof(struct per_struct_kernel, access_id)) /* Access id of the last PER trap */ return (unsigned long) child->thread.per_event.paid << (BITS_PER_LONG - 8); @@ -196,73 +200,65 @@ static inline unsigned long __peek_user_per(struct task_struct *child, */ static unsigned long __peek_user(struct task_struct *child, addr_t addr) { - struct user *dummy = NULL; addr_t offset, tmp; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { /* * psw and gprs are stored on the stack */ tmp = *(addr_t *)((addr_t) &task_pt_regs(child)->psw + addr); - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { /* Return a clean psw mask. */ tmp &= PSW_MASK_USER | PSW_MASK_RI; tmp |= PSW_USER_BITS; } - } else if (addr < (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb reading * from acrs[15]. Result is a 64 bit value. Read the * 32 bit acrs[15] value and shift it by 32. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) tmp = ((unsigned long) child->thread.acrs[15]) << 32; else tmp = *(addr_t *)((addr_t) &child->thread.acrs + offset); - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent reads of padding hole between * orig_gpr2 and fp_regs on s390. */ tmp = 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - tmp = child->thread.fpu.fpc; + tmp = child->thread.ufpu.fpc; tmp <<= BITS_PER_LONG - 32; - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) - tmp = *(addr_t *) - ((addr_t) child->thread.fpu.vxrs + 2*offset); - else - tmp = *(addr_t *) - ((addr_t) child->thread.fpu.fprs + offset); - - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + tmp = *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset); + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); tmp = __peek_user_per(child, addr); } else @@ -281,8 +277,8 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -294,8 +290,6 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data) static inline void __poke_user_per(struct task_struct *child, addr_t addr, addr_t data) { - struct per_struct_kernel *dummy = NULL; - /* * There are only three fields in the per_info struct that the * debugger user can write to. @@ -308,14 +302,14 @@ static inline void __poke_user_per(struct task_struct *child, * addresses are used only if single stepping is not in effect. * Writes to any other field in per_info are ignored. */ - if (addr == (addr_t) &dummy->cr9) + if (addr == offsetof(struct per_struct_kernel, cr9)) /* PER event mask of the user specified per set. */ child->thread.per_user.control = data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy->starting_addr) + else if (addr == offsetof(struct per_struct_kernel, starting_addr)) /* Starting address of the user specified per set. */ child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy->ending_addr) + else if (addr == offsetof(struct per_struct_kernel, ending_addr)) /* Ending address of the user specified per set. */ child->thread.per_user.end = data; } @@ -328,16 +322,15 @@ static inline void __poke_user_per(struct task_struct *child, */ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) { - struct user *dummy = NULL; addr_t offset; - if (addr < (addr_t) &dummy->regs.acrs) { + if (addr < offsetof(struct user, regs.acrs)) { struct pt_regs *regs = task_pt_regs(child); /* * psw and gprs are stored on the stack */ - if (addr == (addr_t) &dummy->regs.psw.mask) { + if (addr == offsetof(struct user, regs.psw.mask)) { unsigned long mask = PSW_MASK_USER; mask |= is_ri_task(child) ? PSW_MASK_RI : 0; @@ -359,62 +352,54 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) regs->int_code = 0x20000 | (data & 0xffff); } *(addr_t *)((addr_t) ®s->psw + addr) = data; - } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) { + } else if (addr < offsetof(struct user, regs.orig_gpr2)) { /* * access registers are stored in the thread structure */ - offset = addr - (addr_t) &dummy->regs.acrs; + offset = addr - offsetof(struct user, regs.acrs); /* * Very special case: old & broken 64 bit gdb writing * to acrs[15] with a 64 bit value. Ignore the lower * half of the value and write the upper 32 bit to * acrs[15]. Sick... */ - if (addr == (addr_t) &dummy->regs.acrs[15]) + if (addr == offsetof(struct user, regs.acrs[15])) child->thread.acrs[15] = (unsigned int) (data >> 32); else *(addr_t *)((addr_t) &child->thread.acrs + offset) = data; - } else if (addr == (addr_t) &dummy->regs.orig_gpr2) { + } else if (addr == offsetof(struct user, regs.orig_gpr2)) { /* * orig_gpr2 is stored on the kernel stack */ task_pt_regs(child)->orig_gpr2 = data; - } else if (addr < (addr_t) &dummy->regs.fp_regs) { + } else if (addr < offsetof(struct user, regs.fp_regs)) { /* * prevent writes of padding hole between * orig_gpr2 and fp_regs on s390. */ return 0; - } else if (addr == (addr_t) &dummy->regs.fp_regs.fpc) { + } else if (addr == offsetof(struct user, regs.fp_regs.fpc)) { /* * floating point control reg. is in the thread structure */ - if ((unsigned int) data != 0 || - test_fp_ctl(data >> (BITS_PER_LONG - 32))) + if ((unsigned int)data != 0) return -EINVAL; - child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); + child->thread.ufpu.fpc = data >> (BITS_PER_LONG - 32); - } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + } else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) { /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array + * floating point regs. are in the child->thread.ufpu.vxrs array */ - offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) - *(addr_t *)((addr_t) - child->thread.fpu.vxrs + 2*offset) = data; - else - *(addr_t *)((addr_t) - child->thread.fpu.fprs + offset) = data; - - } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { + offset = addr - offsetof(struct user, regs.fp_regs.fprs); + *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = data; + } else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) { /* * Handle access to the per_info structure. */ - addr -= (addr_t) &dummy->regs.per_info; + addr -= offsetof(struct user, regs.per_info); __poke_user_per(child, addr, data); } @@ -431,8 +416,8 @@ static int poke_user(struct task_struct *child, addr_t addr, addr_t data) * an alignment of 4. Programmers from hell indeed... */ mask = __ADDR_MASK; - if (addr >= (addr_t) &((struct user *) NULL)->regs.acrs && - addr < (addr_t) &((struct user *) NULL)->regs.orig_gpr2) + if (addr >= offsetof(struct user, regs.acrs) && + addr < offsetof(struct user, regs.orig_gpr2)) mask = 3; if ((addr & mask) || addr > sizeof(struct user) - __ADDR_MASK) return -EIO; @@ -481,22 +466,20 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned long __user *) data); - return 0; + return put_user(child->thread.last_break, (unsigned long __user *)data); case PTRACE_ENABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags &= ~PER_FLAG_NO_TE; return 0; case PTRACE_DISABLE_TE: - if (!MACHINE_HAS_TE) + if (!machine_has_tx()) return -EIO; child->thread.per_flags |= PER_FLAG_NO_TE; child->thread.per_flags &= ~PER_FLAG_TE_ABORT_RAND; return 0; case PTRACE_TE_ABORT_RAND: - if (!MACHINE_HAS_TE || (child->thread.per_flags & PER_FLAG_NO_TE)) + if (!machine_has_tx() || (child->thread.per_flags & PER_FLAG_NO_TE)) return -EIO; switch (data) { case 0UL: @@ -519,332 +502,6 @@ long arch_ptrace(struct task_struct *child, long request, } } -#ifdef CONFIG_COMPAT -/* - * Now the fun part starts... a 31 bit program running in the - * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, - * PTRACE_PEEKDATA, PTRACE_POKETEXT and PTRACE_POKEDATA are easy - * to handle, the difference to the 64 bit versions of the requests - * is that the access is done in multiples of 4 byte instead of - * 8 bytes (sizeof(unsigned long) on 31/64 bit). - * The ugly part are PTRACE_PEEKUSR, PTRACE_PEEKUSR_AREA, - * PTRACE_POKEUSR and PTRACE_POKEUSR_AREA. If the traced program - * is a 31 bit program too, the content of struct user can be - * emulated. A 31 bit program peeking into the struct user of - * a 64 bit program is a no-no. - */ - -/* - * Same as peek_user_per but for a 31 bit program. - */ -static inline __u32 __peek_user_per_compat(struct task_struct *child, - addr_t addr) -{ - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) - /* Control bits of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - PER_EVENT_IFETCH : child->thread.per_user.control; - else if (addr == (addr_t) &dummy32->cr10) - /* Start address of the active per set. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0 : child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->cr11) - /* End address of the active per set. */ - return test_thread_flag(TIF_SINGLE_STEP) ? - PSW32_ADDR_INSN : child->thread.per_user.end; - else if (addr == (addr_t) &dummy32->bits) - /* Single-step bit. */ - return (__u32) test_thread_flag(TIF_SINGLE_STEP) ? - 0x80000000 : 0; - else if (addr == (addr_t) &dummy32->starting_addr) - /* Start address of the user specified per set. */ - return (__u32) child->thread.per_user.start; - else if (addr == (addr_t) &dummy32->ending_addr) - /* End address of the user specified per set. */ - return (__u32) child->thread.per_user.end; - else if (addr == (addr_t) &dummy32->perc_atmid) - /* PER code, ATMID and AI of the last PER trap */ - return (__u32) child->thread.per_event.cause << 16; - else if (addr == (addr_t) &dummy32->address) - /* Address of the last PER trap */ - return (__u32) child->thread.per_event.address; - else if (addr == (addr_t) &dummy32->access_id) - /* Access id of the last PER trap */ - return (__u32) child->thread.per_event.paid << 24; - return 0; -} - -/* - * Same as peek_user but for a 31 bit program. - */ -static u32 __peek_user_compat(struct task_struct *child, addr_t addr) -{ - struct compat_user *dummy32 = NULL; - addr_t offset; - __u32 tmp; - - if (addr < (addr_t) &dummy32->regs.acrs) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw and gprs are stored on the stack - */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { - /* Fake a 31 bit psw mask. */ - tmp = (__u32)(regs->psw.mask >> 32); - tmp &= PSW32_MASK_USER | PSW32_MASK_RI; - tmp |= PSW32_USER_BITS; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { - /* Fake a 31 bit psw address. */ - tmp = (__u32) regs->psw.addr | - (__u32)(regs->psw.mask & PSW_MASK_BA); - } else { - /* gpr 0-15 */ - tmp = *(__u32 *)((addr_t) ®s->psw + addr*2 + 4); - } - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - (addr_t) &dummy32->regs.acrs; - tmp = *(__u32*)((addr_t) &child->thread.acrs + offset); - - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); - - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { - /* - * prevent reads of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - tmp = 0; - - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { - /* - * floating point control reg. is in the thread structure - */ - tmp = child->thread.fpu.fpc; - - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { - /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array - */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) - tmp = *(__u32 *) - ((addr_t) child->thread.fpu.vxrs + 2*offset); - else - tmp = *(__u32 *) - ((addr_t) child->thread.fpu.fprs + offset); - - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { - /* - * Handle access to the per_info structure. - */ - addr -= (addr_t) &dummy32->regs.per_info; - tmp = __peek_user_per_compat(child, addr); - - } else - tmp = 0; - - return tmp; -} - -static int peek_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - __u32 tmp; - - if (!is_compat_task() || (addr & 3) || addr > sizeof(struct user) - 3) - return -EIO; - - tmp = __peek_user_compat(child, addr); - return put_user(tmp, (__u32 __user *) data); -} - -/* - * Same as poke_user_per but for a 31 bit program. - */ -static inline void __poke_user_per_compat(struct task_struct *child, - addr_t addr, __u32 data) -{ - struct compat_per_struct_kernel *dummy32 = NULL; - - if (addr == (addr_t) &dummy32->cr9) - /* PER event mask of the user specified per set. */ - child->thread.per_user.control = - data & (PER_EVENT_MASK | PER_CONTROL_MASK); - else if (addr == (addr_t) &dummy32->starting_addr) - /* Starting address of the user specified per set. */ - child->thread.per_user.start = data; - else if (addr == (addr_t) &dummy32->ending_addr) - /* Ending address of the user specified per set. */ - child->thread.per_user.end = data; -} - -/* - * Same as poke_user but for a 31 bit program. - */ -static int __poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - struct compat_user *dummy32 = NULL; - __u32 tmp = (__u32) data; - addr_t offset; - - if (addr < (addr_t) &dummy32->regs.acrs) { - struct pt_regs *regs = task_pt_regs(child); - /* - * psw, gprs, acrs and orig_gpr2 are stored on the stack - */ - if (addr == (addr_t) &dummy32->regs.psw.mask) { - __u32 mask = PSW32_MASK_USER; - - mask |= is_ri_task(child) ? PSW32_MASK_RI : 0; - /* Build a 64 bit psw mask from 31 bit mask. */ - if ((tmp ^ PSW32_USER_BITS) & ~mask) - /* Invalid psw mask. */ - return -EINVAL; - if ((data & PSW32_MASK_ASC) == PSW32_ASC_HOME) - /* Invalid address-space-control bits */ - return -EINVAL; - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_USER) | - (regs->psw.mask & PSW_MASK_BA) | - (__u64)(tmp & mask) << 32; - } else if (addr == (addr_t) &dummy32->regs.psw.addr) { - /* Build a 64 bit psw address from 31 bit address. */ - regs->psw.addr = (__u64) tmp & PSW32_ADDR_INSN; - /* Transfer 31 bit amode bit to psw mask. */ - regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | - (__u64)(tmp & PSW32_ADDR_AMODE); - } else { - if (test_pt_regs_flag(regs, PIF_SYSCALL) && - addr == offsetof(struct compat_user, regs.gprs[2])) { - struct pt_regs *regs = task_pt_regs(child); - - regs->int_code = 0x20000 | (data & 0xffff); - } - /* gpr 0-15 */ - *(__u32*)((addr_t) ®s->psw + addr*2 + 4) = tmp; - } - } else if (addr < (addr_t) (&dummy32->regs.orig_gpr2)) { - /* - * access registers are stored in the thread structure - */ - offset = addr - (addr_t) &dummy32->regs.acrs; - *(__u32*)((addr_t) &child->thread.acrs + offset) = tmp; - - } else if (addr == (addr_t) (&dummy32->regs.orig_gpr2)) { - /* - * orig_gpr2 is stored on the kernel stack - */ - *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; - - } else if (addr < (addr_t) &dummy32->regs.fp_regs) { - /* - * prevent writess of padding hole between - * orig_gpr2 and fp_regs on s390. - */ - return 0; - - } else if (addr == (addr_t) &dummy32->regs.fp_regs.fpc) { - /* - * floating point control reg. is in the thread structure - */ - if (test_fp_ctl(tmp)) - return -EINVAL; - child->thread.fpu.fpc = data; - - } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { - /* - * floating point regs. are either in child->thread.fpu - * or the child->thread.fpu.vxrs array - */ - offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (MACHINE_HAS_VX) - *(__u32 *)((addr_t) - child->thread.fpu.vxrs + 2*offset) = tmp; - else - *(__u32 *)((addr_t) - child->thread.fpu.fprs + offset) = tmp; - - } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { - /* - * Handle access to the per_info structure. - */ - addr -= (addr_t) &dummy32->regs.per_info; - __poke_user_per_compat(child, addr, data); - } - - return 0; -} - -static int poke_user_compat(struct task_struct *child, - addr_t addr, addr_t data) -{ - if (!is_compat_task() || (addr & 3) || - addr > sizeof(struct compat_user) - 3) - return -EIO; - - return __poke_user_compat(child, addr, data); -} - -long compat_arch_ptrace(struct task_struct *child, compat_long_t request, - compat_ulong_t caddr, compat_ulong_t cdata) -{ - unsigned long addr = caddr; - unsigned long data = cdata; - compat_ptrace_area parea; - int copied, ret; - - switch (request) { - case PTRACE_PEEKUSR: - /* read the word at location addr in the USER area. */ - return peek_user_compat(child, addr, data); - - case PTRACE_POKEUSR: - /* write the word at location addr in the USER area */ - return poke_user_compat(child, addr, data); - - case PTRACE_PEEKUSR_AREA: - case PTRACE_POKEUSR_AREA: - if (copy_from_user(&parea, (void __force __user *) addr, - sizeof(parea))) - return -EFAULT; - addr = parea.kernel_addr; - data = parea.process_addr; - copied = 0; - while (copied < parea.len) { - if (request == PTRACE_PEEKUSR_AREA) - ret = peek_user_compat(child, addr, data); - else { - __u32 utmp; - if (get_user(utmp, - (__u32 __force __user *) data)) - return -EFAULT; - ret = poke_user_compat(child, addr, utmp); - } - if (ret) - return ret; - addr += sizeof(unsigned int); - data += sizeof(unsigned int); - copied += sizeof(unsigned int); - } - return 0; - case PTRACE_GET_LAST_BREAK: - put_user(child->thread.last_break, - (unsigned int __user *) data); - return 0; - } - return compat_ptrace_request(child, request, addr, data); -} -#endif - /* * user_regset definitions. */ @@ -905,10 +562,10 @@ static int s390_fpregs_get(struct task_struct *target, _s390_fp_regs fp_regs; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); - fp_regs.fpc = target->thread.fpu.fpc; - fpregs_store(&fp_regs, &target->thread.fpu); + fp_regs.fpc = target->thread.ufpu.fpc; + fpregs_store(&fp_regs, &target->thread.ufpu); return membuf_write(&to, &fp_regs, sizeof(fp_regs)); } @@ -922,23 +579,17 @@ static int s390_fpregs_set(struct task_struct *target, freg_t fprs[__NUM_FPRS]; if (target == current) - save_fpu_regs(); - - if (MACHINE_HAS_VX) - convert_vx_to_fp(fprs, target->thread.fpu.vxrs); - else - memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs)); - - /* If setting FPC, must validate it first. */ + save_user_fpu_regs(); + convert_vx_to_fp(fprs, target->thread.ufpu.vxrs); if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { - u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; + u32 ufpc[2] = { target->thread.ufpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; - if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) + if (ufpc[1] != 0) return -EINVAL; - target->thread.fpu.fpc = ufpc[0]; + target->thread.ufpu.fpc = ufpc[0]; } if (rc == 0 && count > 0) @@ -946,12 +597,7 @@ static int s390_fpregs_set(struct task_struct *target, fprs, offsetof(s390_fp_regs, fprs), -1); if (rc) return rc; - - if (MACHINE_HAS_VX) - convert_fp_to_vx(target->thread.fpu.vxrs, fprs); - else - memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); - + convert_fp_to_vx(target->thread.ufpu.vxrs, fprs); return rc; } @@ -998,12 +644,12 @@ static int s390_vxrs_low_get(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.ufpu.vxrs[i].low; return membuf_write(&to, vxrs, sizeof(vxrs)); } @@ -1015,18 +661,18 @@ static int s390_vxrs_low_set(struct task_struct *target, __u64 vxrs[__NUM_VXRS_LOW]; int i, rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); + vxrs[i] = target->thread.ufpu.vxrs[i].low; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; + target->thread.ufpu.vxrs[i].low = vxrs[i]; return rc; } @@ -1035,11 +681,11 @@ static int s390_vxrs_high_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); - return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW, + save_user_fpu_regs(); + return membuf_write(&to, target->thread.ufpu.vxrs + __NUM_VXRS_LOW, __NUM_VXRS_HIGH * sizeof(__vector128)); } @@ -1050,13 +696,13 @@ static int s390_vxrs_high_set(struct task_struct *target, { int rc; - if (!MACHINE_HAS_VX) + if (!cpu_has_vx()) return -ENODEV; if (target == current) - save_fpu_regs(); + save_user_fpu_regs(); rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1); + target->thread.ufpu.vxrs + __NUM_VXRS_LOW, 0, -1); return rc; } @@ -1083,7 +729,7 @@ static int s390_gs_cb_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1100,7 +746,7 @@ static int s390_gs_cb_set(struct task_struct *target, struct gs_cb gs_cb = { }, *data = NULL; int rc; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!target->thread.gs_cb) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -1124,7 +770,7 @@ static int s390_gs_cb_set(struct task_struct *target, target->thread.gs_cb = data; *target->thread.gs_cb = gs_cb; if (target == current) { - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); restore_gs_cb(target->thread.gs_cb); } preempt_enable(); @@ -1137,7 +783,7 @@ static int s390_gs_bc_get(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) return -ENODATA; @@ -1151,7 +797,7 @@ static int s390_gs_bc_set(struct task_struct *target, { struct gs_cb *data = target->thread.gs_bc_cb; - if (!MACHINE_HAS_GS) + if (!cpu_has_gs()) return -ENODEV; if (!data) { data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -1256,7 +902,7 @@ static int s390_runtime_instr_set(struct task_struct *target, static const struct user_regset s390_regsets[] = { { - .core_note_type = NT_PRSTATUS, + USER_REGSET_NOTE_TYPE(PRSTATUS), .n = sizeof(s390_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1264,7 +910,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_regs_set, }, { - .core_note_type = NT_PRFPREG, + USER_REGSET_NOTE_TYPE(PRFPREG), .n = sizeof(s390_fp_regs) / sizeof(long), .size = sizeof(long), .align = sizeof(long), @@ -1272,7 +918,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_fpregs_set, }, { - .core_note_type = NT_S390_SYSTEM_CALL, + USER_REGSET_NOTE_TYPE(S390_SYSTEM_CALL), .n = 1, .size = sizeof(unsigned int), .align = sizeof(unsigned int), @@ -1280,7 +926,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_system_call_set, }, { - .core_note_type = NT_S390_LAST_BREAK, + USER_REGSET_NOTE_TYPE(S390_LAST_BREAK), .n = 1, .size = sizeof(long), .align = sizeof(long), @@ -1288,7 +934,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_last_break_set, }, { - .core_note_type = NT_S390_TDB, + USER_REGSET_NOTE_TYPE(S390_TDB), .n = 1, .size = 256, .align = 1, @@ -1296,7 +942,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_tdb_set, }, { - .core_note_type = NT_S390_VXRS_LOW, + USER_REGSET_NOTE_TYPE(S390_VXRS_LOW), .n = __NUM_VXRS_LOW, .size = sizeof(__u64), .align = sizeof(__u64), @@ -1304,7 +950,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_vxrs_low_set, }, { - .core_note_type = NT_S390_VXRS_HIGH, + USER_REGSET_NOTE_TYPE(S390_VXRS_HIGH), .n = __NUM_VXRS_HIGH, .size = sizeof(__vector128), .align = sizeof(__vector128), @@ -1312,7 +958,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_vxrs_high_set, }, { - .core_note_type = NT_S390_GS_CB, + USER_REGSET_NOTE_TYPE(S390_GS_CB), .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1320,7 +966,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_gs_cb_set, }, { - .core_note_type = NT_S390_GS_BC, + USER_REGSET_NOTE_TYPE(S390_GS_BC), .n = sizeof(struct gs_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1328,7 +974,7 @@ static const struct user_regset s390_regsets[] = { .set = s390_gs_bc_set, }, { - .core_note_type = NT_S390_RI_CB, + USER_REGSET_NOTE_TYPE(S390_RI_CB), .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), .size = sizeof(__u64), .align = sizeof(__u64), @@ -1344,225 +990,8 @@ static const struct user_regset_view user_s390_view = { .n = ARRAY_SIZE(s390_regsets) }; -#ifdef CONFIG_COMPAT -static int s390_compat_regs_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - unsigned n; - - if (target == current) - save_access_regs(target->thread.acrs); - - for (n = 0; n < sizeof(s390_compat_regs); n += sizeof(compat_ulong_t)) - membuf_store(&to, __peek_user_compat(target, n)); - return 0; -} - -static int s390_compat_regs_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int rc = 0; - - if (target == current) - save_access_regs(target->thread.acrs); - - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0 && !rc) { - rc = __poke_user_compat(target, pos, *k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - compat_ulong_t word; - rc = __get_user(word, u++); - if (rc) - break; - rc = __poke_user_compat(target, pos, word); - count -= sizeof(*u); - pos += sizeof(*u); - } - } - - if (rc == 0 && target == current) - restore_access_regs(target->thread.acrs); - - return rc; -} - -static int s390_compat_regs_high_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t *gprs_high; - int i; - - gprs_high = (compat_ulong_t *)task_pt_regs(target)->gprs; - for (i = 0; i < NUM_GPRS; i++, gprs_high += 2) - membuf_store(&to, *gprs_high); - return 0; -} - -static int s390_compat_regs_high_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - compat_ulong_t *gprs_high; - int rc = 0; - - gprs_high = (compat_ulong_t *) - &task_pt_regs(target)->gprs[pos / sizeof(compat_ulong_t)]; - if (kbuf) { - const compat_ulong_t *k = kbuf; - while (count > 0) { - *gprs_high = *k++; - *gprs_high += 2; - count -= sizeof(*k); - } - } else { - const compat_ulong_t __user *u = ubuf; - while (count > 0 && !rc) { - unsigned long word; - rc = __get_user(word, u++); - if (rc) - break; - *gprs_high = word; - *gprs_high += 2; - count -= sizeof(*u); - } - } - - return rc; -} - -static int s390_compat_last_break_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - compat_ulong_t last_break = target->thread.last_break; - - return membuf_store(&to, (unsigned long)last_break); -} - -static int s390_compat_last_break_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - return 0; -} - -static const struct user_regset s390_compat_regsets[] = { - { - .core_note_type = NT_PRSTATUS, - .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_get, - .set = s390_compat_regs_set, - }, - { - .core_note_type = NT_PRFPREG, - .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_fpregs_get, - .set = s390_fpregs_set, - }, - { - .core_note_type = NT_S390_SYSTEM_CALL, - .n = 1, - .size = sizeof(compat_uint_t), - .align = sizeof(compat_uint_t), - .regset_get = s390_system_call_get, - .set = s390_system_call_set, - }, - { - .core_note_type = NT_S390_LAST_BREAK, - .n = 1, - .size = sizeof(long), - .align = sizeof(long), - .regset_get = s390_compat_last_break_get, - .set = s390_compat_last_break_set, - }, - { - .core_note_type = NT_S390_TDB, - .n = 1, - .size = 256, - .align = 1, - .regset_get = s390_tdb_get, - .set = s390_tdb_set, - }, - { - .core_note_type = NT_S390_VXRS_LOW, - .n = __NUM_VXRS_LOW, - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_vxrs_low_get, - .set = s390_vxrs_low_set, - }, - { - .core_note_type = NT_S390_VXRS_HIGH, - .n = __NUM_VXRS_HIGH, - .size = sizeof(__vector128), - .align = sizeof(__vector128), - .regset_get = s390_vxrs_high_get, - .set = s390_vxrs_high_set, - }, - { - .core_note_type = NT_S390_HIGH_GPRS, - .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), - .size = sizeof(compat_long_t), - .align = sizeof(compat_long_t), - .regset_get = s390_compat_regs_high_get, - .set = s390_compat_regs_high_set, - }, - { - .core_note_type = NT_S390_GS_CB, - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_cb_get, - .set = s390_gs_cb_set, - }, - { - .core_note_type = NT_S390_GS_BC, - .n = sizeof(struct gs_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_gs_bc_get, - .set = s390_gs_bc_set, - }, - { - .core_note_type = NT_S390_RI_CB, - .n = sizeof(struct runtime_instr_cb) / sizeof(__u64), - .size = sizeof(__u64), - .align = sizeof(__u64), - .regset_get = s390_runtime_instr_get, - .set = s390_runtime_instr_set, - }, -}; - -static const struct user_regset_view user_s390_compat_view = { - .name = "s390", - .e_machine = EM_S390, - .regsets = s390_compat_regsets, - .n = ARRAY_SIZE(s390_compat_regsets) -}; -#endif - const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) - return &user_s390_compat_view; -#endif return &user_s390_view; } @@ -1571,13 +1000,6 @@ static const char *gpr_names[NUM_GPRS] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", }; -unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) -{ - if (offset >= NUM_GPRS) - return 0; - return regs->gprs[offset]; -} - int regs_query_register_offset(const char *name) { unsigned long offset; @@ -1597,29 +1019,3 @@ const char *regs_query_register_name(unsigned int offset) return NULL; return gpr_names[offset]; } - -static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) -{ - unsigned long ksp = kernel_stack_pointer(regs); - - return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); -} - -/** - * regs_get_kernel_stack_nth() - get Nth entry of the stack - * @regs:pt_regs which contains kernel stack pointer. - * @n:stack entry number. - * - * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which - * is specifined by @regs. If the @n th entry is NOT in the kernel stack, - * this returns 0. - */ -unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) -{ - unsigned long addr; - - addr = kernel_stack_pointer(regs) + n * sizeof(long); - if (!regs_within_kernel_stack(regs, addr)) - return 0; - return *(unsigned long *)addr; -} diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 4a22163962eb..69fcaf54d5ca 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -9,6 +9,7 @@ #include <asm/asm-offsets.h> #include <asm/nospec-insn.h> #include <asm/sigp.h> +#include <asm/lowcore.h> GEN_BR_THUNK %r9 @@ -19,21 +20,16 @@ # r2 = Function to be called after store status # r3 = Parameter for function # -ENTRY(store_status) - /* Save register one and load save area base */ - stg %r1,__LC_SAVE_AREA_RESTART +SYM_CODE_START(store_status) + STMG_LC %r0,%r15,__LC_GPREGS_SAVE_AREA /* General purpose registers */ - lghi %r1,__LC_GPREGS_SAVE_AREA - stmg %r0,%r15,0(%r1) - mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + GET_LC %r13 /* Control registers */ - lghi %r1,__LC_CREGS_SAVE_AREA - stctg %c0,%c15,0(%r1) + stctg %c0,%c15,__LC_CREGS_SAVE_AREA(%r13) /* Access registers */ - lghi %r1,__LC_AREGS_SAVE_AREA - stam %a0,%a15,0(%r1) + stamy %a0,%a15,__LC_AREGS_SAVE_AREA(%r13) /* Floating point registers */ - lghi %r1,__LC_FPREGS_SAVE_AREA + lay %r1,__LC_FPREGS_SAVE_AREA(%r13) std %f0, 0x00(%r1) std %f1, 0x08(%r1) std %f2, 0x10(%r1) @@ -51,21 +47,21 @@ ENTRY(store_status) std %f14,0x70(%r1) std %f15,0x78(%r1) /* Floating point control register */ - lghi %r1,__LC_FP_CREG_SAVE_AREA + lay %r1,__LC_FP_CREG_SAVE_AREA(%r13) stfpc 0(%r1) /* CPU timer */ - lghi %r1,__LC_CPU_TIMER_SAVE_AREA + lay %r1,__LC_CPU_TIMER_SAVE_AREA(%r13) stpt 0(%r1) /* Store prefix register */ - lghi %r1,__LC_PREFIX_SAVE_AREA + lay %r1,__LC_PREFIX_SAVE_AREA(%r13) stpx 0(%r1) /* Clock comparator - seven bytes */ - lghi %r1,__LC_CLOCK_COMP_SAVE_AREA - larl %r4,.Lclkcmp + larl %r4,clkcmp stckc 0(%r4) + lay %r1,__LC_CLOCK_COMP_SAVE_AREA(%r13) mvc 1(7,%r1),1(%r4) /* Program status word */ - lghi %r1,__LC_PSW_SAVE_AREA + lay %r1,__LC_PSW_SAVE_AREA(%r13) epsw %r4,%r5 st %r4,0(%r1) st %r5,4(%r1) @@ -73,9 +69,9 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 -ENDPROC(store_status) +SYM_CODE_END(store_status) .section .bss - .align 8 -.Lclkcmp: .quad 0x0000000000000000 + .balign 8 +SYM_DATA_LOCAL(clkcmp, .quad 0x0000000000000000) .previous diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index fe396673e8a6..0ae297c82afd 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -2,8 +2,7 @@ /* * Copyright IBM Corp. 2005 * - * Author(s): Rolf Adelsberger, - * Heiko Carstens <heiko.carstens@de.ibm.com> + * Author(s): Rolf Adelsberger * */ @@ -15,6 +14,7 @@ * moves the new kernel to its destination... * %r2 = pointer to first kimage_entry_t * %r3 = start address - where to jump to after the job is done... + * %r4 = subcode * * %r5 will be used as temp. storage * %r6 holds the destination address @@ -26,53 +26,51 @@ */ .text -ENTRY(relocate_kernel) - basr %r13,0 # base address - .base: - lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 - lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 - lg %r5,0(%r2) # read another word for indirection page - aghi %r2,8 # increment pointer - tml %r5,0x1 # is it a destination page? - je .indir_check # NO, goto "indir_check" - lgr %r6,%r5 # r6 = r5 - nill %r6,0xf000 # mask it out and... - j .base # ...next iteration - .indir_check: - tml %r5,0x2 # is it a indirection page? - je .done_test # NO, goto "done_test" - nill %r5,0xf000 # YES, mask out, - lgr %r2,%r5 # move it into the right register, - j .base # and read next... - .done_test: - tml %r5,0x4 # is it the done indicator? - je .source_test # NO! Well, then it should be the source indicator... - j .done # ok, lets finish it here... - .source_test: - tml %r5,0x8 # it should be a source indicator... - je .base # NO, ignore it... - lgr %r8,%r5 # r8 = r5 - nill %r8,0xf000 # masking - 0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 - jo 0b - j .base - .done: - sgr %r0,%r0 # clear register r0 - cghi %r3,0 - je .diag - la %r4,load_psw-.base(%r13) # load psw-address into the register - o %r3,4(%r4) # or load address into psw - st %r3,4(%r4) - mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 - .diag: - diag %r0,%r0,0x308 -ENDPROC(relocate_kernel) +SYM_CODE_START(relocate_kernel) + basr %r13,0 # base address +.base: + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 + lg %r5,0(%r2) # read another word for indirection page + aghi %r2,8 # increment pointer + tml %r5,0x1 # is it a destination page? + je .indir_check # NO, goto "indir_check" + lgr %r6,%r5 # r6 = r5 + nill %r6,0xf000 # mask it out and... + j .base # ...next iteration +.indir_check: + tml %r5,0x2 # is it a indirection page? + je .done_test # NO, goto "done_test" + nill %r5,0xf000 # YES, mask out, + lgr %r2,%r5 # move it into the right register, + j .base # and read next... +.done_test: + tml %r5,0x4 # is it the done indicator? + je .source_test # NO! Well, then it should be the source indicator... + j .done # ok, lets finish it here... +.source_test: + tml %r5,0x8 # it should be a source indicator... + je .base # NO, ignore it... + lgr %r8,%r5 # r8 = r5 + nill %r8,0xf000 # masking +0: mvcle %r6,%r8,0x0 # copy PAGE_SIZE bytes from r8 to r6 - pad with 0 + jo 0b + j .base +.done: + lgr %r0,%r4 # subcode + cghi %r3,0 + je .diag + la %r4,load_psw-.base(%r13) # load psw-address into the register + o %r3,4(%r4) # or load address into psw + st %r3,4(%r4) + mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 +.diag: + diag %r0,%r0,0x308 +SYM_CODE_END(relocate_kernel) - .align 8 - load_psw: - .long 0x00080000,0x80000000 - relocate_kernel_end: - .align 8 - .globl relocate_kernel_len - relocate_kernel_len: - .quad relocate_kernel_end - relocate_kernel + .balign 8 +SYM_DATA_START_LOCAL(load_psw) + .long 0x00080000,0x80000000 +SYM_DATA_END_LABEL(load_psw, SYM_L_LOCAL, relocate_kernel_end) + .balign 8 +SYM_DATA(relocate_kernel_len, .quad relocate_kernel_end - relocate_kernel) diff --git a/arch/s390/kernel/rethook.c b/arch/s390/kernel/rethook.c new file mode 100644 index 000000000000..af10e6bdd34e --- /dev/null +++ b/arch/s390/kernel/rethook.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include "rethook.h" + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + rh->ret_addr = regs->gprs[14]; + rh->frame = regs->gprs[15]; + + /* Replace the return addr with trampoline addr */ + regs->gprs[14] = (unsigned long)&arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); + +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* Replace fake return address with real one. */ + regs->gprs[14] = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +/* + * Called from arch_rethook_trampoline + */ +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + return rethook_trampoline_handler(regs, regs->gprs[15]); +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* assembler function that handles the rethook must not be probed itself */ +NOKPROBE_SYMBOL(arch_rethook_trampoline); diff --git a/arch/s390/kernel/rethook.h b/arch/s390/kernel/rethook.h new file mode 100644 index 000000000000..32f069eed3f3 --- /dev/null +++ b/arch/s390/kernel/rethook.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __S390_RETHOOK_H +#define __S390_RETHOOK_H + +unsigned long arch_rethook_trampoline_callback(struct pt_regs *regs); + +#endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ff0f9e838916..c1fe0b53c5ac 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -13,8 +13,7 @@ * This file handles the architecture-dependent parts of initialization */ -#define KMSG_COMPONENT "setup" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "setup: " fmt #include <linux/errno.h> #include <linux/export.h> @@ -47,17 +46,19 @@ #include <linux/kexec.h> #include <linux/crash_dump.h> #include <linux/memory.h> -#include <linux/compat.h> #include <linux/start_kernel.h> #include <linux/hugetlb.h> +#include <linux/kmemleak.h> +#include <asm/archrandom.h> #include <asm/boot_data.h> +#include <asm/machine.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> #include <asm/mmu_context.h> #include <asm/cpcmd.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/nmi.h> #include <asm/irq.h> #include <asm/page.h> @@ -72,7 +73,8 @@ #include <asm/numa.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> -#include <asm/mem_detect.h> +#include <asm/physmem_info.h> +#include <asm/maccess.h> #include <asm/uv.h> #include <asm/asm-offsets.h> #include "entry.h" @@ -89,40 +91,89 @@ EXPORT_SYMBOL(console_devno); unsigned int console_irq = -1; EXPORT_SYMBOL(console_irq); -unsigned long elf_hwcap __read_mostly = 0; -char elf_platform[ELF_PLATFORM_SIZE]; +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .amode31 section. + */ +char __amode31_ref *__samode31 = _samode31; +char __amode31_ref *__eamode31 = _eamode31; +char __amode31_ref *__stext_amode31 = _stext_amode31; +char __amode31_ref *__etext_amode31 = _etext_amode31; +struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table; +struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table; + +/* + * Control registers CR2, CR5 and CR15 are initialized with addresses + * of tables that must be placed below 2G which is handled by the AMODE31 + * sections. + * Because the AMODE31 sections are relocated below 2G at startup, + * the content of control registers CR2, CR5 and CR15 must be updated + * with new addresses after the relocation. The initial initialization of + * control registers occurs in head.S and then gets updated again after AMODE31 + * relocation. We must access the relevant AMODE31 tables indirectly via + * pointers placed in the .amode31.refs linker section. Those pointers get + * updated automatically during AMODE31 relocation and always contain a valid + * address within AMODE31 sections. + */ + +static __amode31_data u32 __ctl_duct_amode31[16] __aligned(64); + +static __amode31_data u64 __ctl_aste_amode31[8] __aligned(64) = { + [1] = 0xffffffffffffffff +}; + +static __amode31_data u32 __ctl_duald_amode31[32] __aligned(128) = { + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0, + 0x80000000, 0, 0, 0 +}; -unsigned long int_hwcap = 0; +static __amode31_data u32 __ctl_linkage_stack_amode31[8] __aligned(64) = { + 0, 0, 0x89000000, 0, + 0, 0, 0x8a000000, 0 +}; -int __bootdata(noexec_disabled); -unsigned long __bootdata(ident_map_size); -struct mem_detect_info __bootdata(mem_detect); +static u64 __amode31_ref *__ctl_aste = __ctl_aste_amode31; +static u32 __amode31_ref *__ctl_duald = __ctl_duald_amode31; +static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; +static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; -struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); -struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); -unsigned long __bootdata_preserved(__stext_dma); -unsigned long __bootdata_preserved(__etext_dma); -unsigned long __bootdata_preserved(__sdma); -unsigned long __bootdata_preserved(__edma); -unsigned long __bootdata_preserved(__kaslr_offset); +unsigned long __bootdata_preserved(max_mappable); +struct physmem_info __bootdata(physmem_info); + +struct vm_layout __bootdata_preserved(vm_layout); +EXPORT_SYMBOL(vm_layout); +int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); EXPORT_SYMBOL(stfle_fac_list); -u64 __bootdata_preserved(alt_stfle_fac_list[16]); +struct oldmem_data __bootdata_preserved(oldmem_data); + +char __bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); -unsigned long VMALLOC_START; +unsigned long __bootdata_preserved(VMALLOC_START); EXPORT_SYMBOL(VMALLOC_START); -unsigned long VMALLOC_END; +unsigned long __bootdata_preserved(VMALLOC_END); EXPORT_SYMBOL(VMALLOC_END); -struct page *vmemmap; +struct page *__bootdata_preserved(vmemmap); EXPORT_SYMBOL(vmemmap); -unsigned long vmemmap_size; +unsigned long __bootdata_preserved(vmemmap_size); -unsigned long MODULES_VADDR; -unsigned long MODULES_END; +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); /* An array with a pointer to the lowcore of every CPU. */ struct lowcore *lowcore_ptr[NR_CPUS]; @@ -197,7 +248,7 @@ static void __init conmode_default(void) char query_buffer[1024]; char *ptr; - if (MACHINE_IS_VM) { + if (machine_is_vm()) { cpcmd("QUERY CONSOLE", query_buffer, 1024, NULL); console_devno = simple_strtoul(query_buffer + 5, NULL, 16); ptr = strstr(query_buffer, "SUBCHANNEL ="); @@ -235,7 +286,7 @@ static void __init conmode_default(void) SET_CONSOLE_SCLP; #endif } - } else if (MACHINE_IS_KVM) { + } else if (machine_is_kvm()) { if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE)) SET_CONSOLE_VT220; else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE)) @@ -254,9 +305,9 @@ static void __init setup_zfcpdump(void) { if (!is_ipl_type_dump()) return; - if (OLDMEM_BASE) + if (oldmem_data.start) return; - strcat(boot_command_line, " cio_ignore=all,!ipldev,!condev"); + strlcat(boot_command_line, " cio_ignore=all,!ipldev,!condev", COMMAND_LINE_SIZE); console_loglevel = 2; } #else @@ -311,60 +362,30 @@ void *restart_stack; unsigned long stack_alloc(void) { -#ifdef CONFIG_VMAP_STACK - return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE, - THREADINFO_GFP, NUMA_NO_NODE, - __builtin_return_address(0)); -#else - return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); -#endif + void *stack; + + stack = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(stack); + return (unsigned long)stack; } void stack_free(unsigned long stack) { -#ifdef CONFIG_VMAP_STACK - vfree((void *) stack); -#else - free_pages(stack, THREAD_SIZE_ORDER); -#endif + vfree((void *)stack); } -int __init arch_early_irq_init(void) +static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - if (!stack) - panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = stack + STACK_INIT_OFFSET; - return 0; + stack = (unsigned long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); + return stack; } -void __init arch_call_rest_init(void) +static void __init setup_lowcore(void) { - unsigned long stack; - - stack = stack_alloc(); - if (!stack) - panic("Couldn't allocate kernel stack"); - current->stack = (void *) stack; -#ifdef CONFIG_VMAP_STACK - current->stack_vm_area = (void *) stack; -#endif - set_task_stack_end_magic(current); - stack += STACK_INIT_OFFSET; - S390_lowcore.kernel_stack = stack; - call_on_stack_noreturn(rest_init, stack); -} - -static void __init setup_lowcore_dat_off(void) -{ - unsigned long int_psw_mask = PSW_KERNEL_BITS; - unsigned long mcck_stack; - struct lowcore *lc; - - if (IS_ENABLED(CONFIG_KASAN)) - int_psw_mask |= PSW_MASK_DAT; + struct lowcore *lc, *abs_lc; /* * Setup lowcore for boot cpu @@ -375,44 +396,40 @@ static void __init setup_lowcore_dat_off(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); - lc->restart_psw.mask = PSW_KERNEL_BITS; - lc->restart_psw.addr = (unsigned long) restart_int_handler; - lc->external_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->pcpu = (unsigned long)per_cpu_ptr(&pcpu_devices, 0); + lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; + lc->restart_psw.addr = __pa(restart_int_handler); + lc->external_new_psw.mask = PSW_KERNEL_BITS; lc->external_new_psw.addr = (unsigned long) ext_int_handler; - lc->svc_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->svc_new_psw.mask = PSW_KERNEL_BITS; lc->svc_new_psw.addr = (unsigned long) system_call; - lc->program_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->program_new_psw.mask = PSW_KERNEL_BITS; lc->program_new_psw.addr = (unsigned long) pgm_check_handler; lc->mcck_new_psw.mask = PSW_KERNEL_BITS; lc->mcck_new_psw.addr = (unsigned long) mcck_int_handler; - lc->io_new_psw.mask = int_psw_mask | PSW_MASK_MCHECK; + lc->io_new_psw.mask = PSW_KERNEL_BITS; lc->io_new_psw.addr = (unsigned long) io_int_handler; lc->clock_comparator = clock_comparator_max; - lc->nodat_stack = ((unsigned long) &init_thread_union) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; - lc->machine_flags = S390_lowcore.machine_flags; - lc->preempt_count = S390_lowcore.preempt_count; - nmi_alloc_boot_cpu(lc); - lc->sys_enter_timer = S390_lowcore.sys_enter_timer; - lc->exit_timer = S390_lowcore.exit_timer; - lc->user_timer = S390_lowcore.user_timer; - lc->system_timer = S390_lowcore.system_timer; - lc->steal_timer = S390_lowcore.steal_timer; - lc->last_update_timer = S390_lowcore.last_update_timer; - lc->last_update_clock = S390_lowcore.last_update_clock; - + lc->preempt_count = get_lowcore()->preempt_count; + nmi_alloc_mcesa_early(&lc->mcesad); + lc->sys_enter_timer = get_lowcore()->sys_enter_timer; + lc->exit_timer = get_lowcore()->exit_timer; + lc->user_timer = get_lowcore()->user_timer; + lc->system_timer = get_lowcore()->system_timer; + lc->steal_timer = get_lowcore()->steal_timer; + lc->last_update_timer = get_lowcore()->last_update_timer; + lc->last_update_clock = get_lowcore()->last_update_clock; /* * Allocate the global restart stack which is the same for - * all CPUs in cast *one* of them does a PSW restart. + * all CPUs in case *one* of them does a PSW restart. */ - restart_stack = memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!restart_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - restart_stack += STACK_INIT_OFFSET; - + restart_stack = (void *)(stack_alloc_early() + STACK_INIT_OFFSET); + lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; + lc->kernel_stack = get_lowcore()->kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. This is necessary if @@ -421,41 +438,32 @@ static void __init setup_lowcore_dat_off(void) lc->restart_stack = (unsigned long) restart_stack; lc->restart_fn = (unsigned long) do_restart; lc->restart_data = 0; - lc->restart_source = -1UL; - - mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!mcck_stack) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET; - - /* Setup absolute zero lowcore */ - mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack); - mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn); - mem_assign_absolute(S390_lowcore.restart_data, lc->restart_data); - mem_assign_absolute(S390_lowcore.restart_source, lc->restart_source); - mem_assign_absolute(S390_lowcore.restart_psw, lc->restart_psw); - + lc->restart_source = -1U; lc->spinlock_lockval = arch_spin_lockval(0); lc->spinlock_index = 0; arch_spin_lock_setup(0); - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - - set_prefix((u32)(unsigned long) lc); + lc->kernel_asce = get_lowcore()->kernel_asce; + lc->user_asce = get_lowcore()->user_asce; + + system_ctlreg_init_save_area(lc); + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = lc->restart_stack; + abs_lc->restart_fn = lc->restart_fn; + abs_lc->restart_data = lc->restart_data; + abs_lc->restart_source = lc->restart_source; + abs_lc->restart_psw = lc->restart_psw; + abs_lc->restart_flags = RESTART_FLAG_CTLREGS; + abs_lc->program_new_psw = lc->program_new_psw; + abs_lc->mcesad = lc->mcesad; + put_abs_lowcore(abs_lc); + + set_prefix(__pa(lc)); lowcore_ptr[0] = lc; -} - -static void __init setup_lowcore_dat_on(void) -{ - __ctl_clear_bit(0, 28); - S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.svc_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.program_new_psw.mask |= PSW_MASK_DAT; - S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT; - __ctl_set_bit(0, 28); + if (abs_lowcore_map(0, lowcore_ptr[0], false)) + panic("Couldn't setup absolute lowcore"); } static struct resource code_resource = { @@ -486,25 +494,22 @@ static void __init setup_resources(void) int j; u64 i; - code_resource.start = (unsigned long) _text; - code_resource.end = (unsigned long) _etext - 1; - data_resource.start = (unsigned long) _etext; - data_resource.end = (unsigned long) _edata - 1; - bss_resource.start = (unsigned long) __bss_start; - bss_resource.end = (unsigned long) __bss_stop - 1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext) - 1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata) - 1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; res->start = start; /* * In memblock, end points to the first byte after the - * range while in resourses, end points to the last byte in + * range while in resources, end points to the last byte in * the range. */ res->end = end - 1; @@ -516,10 +521,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -538,7 +540,8 @@ static void __init setup_resources(void) * part of the System RAM resource. */ if (crashk_res.end) { - memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), + 0, MEMBLOCK_NONE); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -547,7 +550,6 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - memblock_remove(ident_map_size, ULONG_MAX); max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } @@ -579,11 +581,15 @@ static struct notifier_block kdump_mem_nb = { #endif /* - * Make sure that the area above identity mapping is protected + * Reserve page tables created by decompressor */ -static void __init reserve_above_ident_map(void) +static void __init reserve_pgtables(void) { - memblock_reserve(ident_map_size, ULONG_MAX); + unsigned long start, end; + struct reserved_range *range; + + for_each_physmem_reserved_type_range(RR_VMEM, range, &start, &end) + memblock_reserve(start, end - start); } /* @@ -596,8 +602,8 @@ static void __init reserve_crashkernel(void) phys_addr_t low, high; int rc; - rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size, - &crash_base); + rc = parse_crashkernel(boot_command_line, ident_map_size, + &crash_size, &crash_base, NULL, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); @@ -610,9 +616,9 @@ static void __init reserve_crashkernel(void) return; } - low = crash_base ?: OLDMEM_BASE; + low = crash_base ?: oldmem_data.start; high = low + crash_size; - if (low >= OLDMEM_BASE && high <= OLDMEM_BASE + OLDMEM_SIZE) { + if (low >= oldmem_data.start && high <= oldmem_data.start + oldmem_data.size) { /* The crashkernel fits into OLDMEM, reuse OLDMEM */ crash_base = low; } else { @@ -626,8 +632,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -636,10 +643,12 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_phys_free(crash_base, crash_size); return; + } - if (!OLDMEM_BASE && MACHINE_IS_VM) + if (!oldmem_data.start && machine_is_vm()) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -657,13 +666,13 @@ static void __init reserve_crashkernel(void) */ static void __init reserve_initrd(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (!INITRD_START || !INITRD_SIZE) + unsigned long addr, size; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD) || !get_physmem_reserved(RR_INITRD, &addr, &size)) return; - initrd_start = INITRD_START; - initrd_end = initrd_start + INITRD_SIZE; - memblock_reserve(INITRD_START, INITRD_SIZE); -#endif + initrd_start = (unsigned long)__va(addr); + initrd_end = initrd_start + size; + memblock_reserve(addr, size); } /* @@ -675,83 +684,71 @@ static void __init reserve_certificate_list(void) memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); } -static void __init reserve_mem_detect_info(void) +static void __init reserve_physmem_info(void) { - unsigned long start, size; + unsigned long addr, size; - get_mem_detect_reserved(&start, &size); - if (size) - memblock_reserve(start, size); + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) + memblock_reserve(addr, size); } -static void __init free_mem_detect_info(void) +static void __init free_physmem_info(void) { - unsigned long start, size; - - get_mem_detect_reserved(&start, &size); - if (size) - memblock_free(start, size); -} + unsigned long addr, size; -static const char * __init get_mem_info_source(void) -{ - switch (mem_detect.info_source) { - case MEM_DETECT_SCLP_STOR_INFO: - return "sclp storage info"; - case MEM_DETECT_DIAG260: - return "diag260"; - case MEM_DETECT_SCLP_READ_INFO: - return "sclp read info"; - case MEM_DETECT_BIN_SEARCH: - return "binary search"; - } - return "none"; + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) + memblock_phys_free(addr, size); } -static void __init memblock_add_mem_detect_info(void) +static void __init memblock_add_physmem_info(void) { unsigned long start, end; int i; pr_debug("physmem info source: %s (%hhd)\n", - get_mem_info_source(), mem_detect.info_source); + get_physmem_info_source(), physmem_info.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) { + for_each_physmem_usable_range(i, &start, &end) memblock_add(start, end - start); + for_each_physmem_online_range(i, &start, &end) memblock_physmem_add(start, end - start); - } memblock_set_bottom_up(false); memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); - memblock_dump_all(); +} + +static void __init setup_high_memory(void) +{ + high_memory = __va(ident_map_size); } /* - * Check for initrd being in usable memory + * Reserve memory used for lowcore. */ -static void __init check_initrd(void) +static void __init reserve_lowcore(void) { -#ifdef CONFIG_BLK_DEV_INITRD - if (INITRD_START && INITRD_SIZE && - !memblock_is_region_memory(INITRD_START, INITRD_SIZE)) { - pr_err("The initial RAM disk does not fit into the memory\n"); - memblock_free(INITRD_START, INITRD_SIZE); - initrd_start = initrd_end = 0; + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if (absolute_pointer(__identity_base) < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); } -#endif } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for absolute lowcore/command line/kernel image. */ static void __init reserve_kernel(void) { - unsigned long start_pfn = PFN_UP(__pa(_end)); - - memblock_reserve(0, HEAD_END); - memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - - (unsigned long)_stext); - memblock_reserve(__sdma, __edma - __sdma); + memblock_reserve(0, STARTUP_NORMAL_OFFSET); + memblock_reserve(OLDMEM_BASE, sizeof(unsigned long)); + memblock_reserve(OLDMEM_SIZE, sizeof(unsigned long)); + memblock_reserve(physmem_info.reserved[RR_AMODE31].start, __eamode31 - __samode31); + memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP); + memblock_reserve(__pa(_stext), _end - _stext); } static void __init setup_memory(void) @@ -766,157 +763,48 @@ static void __init setup_memory(void) storage_key_init_range(start, end); psw_set_key(PAGE_DEFAULT_KEY); - - /* Only cosmetics */ - memblock_enforce_memory_limit(memblock_end_of_DRAM()); } -/* - * Setup hardware capabilities. - */ -static int __init setup_hwcaps(void) +static void __init relocate_amode31_section(void) { - static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 }; - struct cpuid cpu_id; - int i; - - /* - * The store facility list bits numbers as found in the principles - * of operation are numbered with bit 1UL<<31 as number 0 to - * bit 1UL<<0 as number 31. - * Bit 0: instructions named N3, "backported" to esa-mode - * Bit 2: z/Architecture mode is active - * Bit 7: the store-facility-list-extended facility is installed - * Bit 17: the message-security assist is installed - * Bit 19: the long-displacement facility is installed - * Bit 21: the extended-immediate facility is installed - * Bit 22: extended-translation facility 3 is installed - * Bit 30: extended-translation facility 3 enhancement facility - * These get translated to: - * HWCAP_S390_ESAN3 bit 0, HWCAP_S390_ZARCH bit 1, - * HWCAP_S390_STFLE bit 2, HWCAP_S390_MSA bit 3, - * HWCAP_S390_LDISP bit 4, HWCAP_S390_EIMM bit 5 and - * HWCAP_S390_ETF3EH bit 8 (22 && 30). - */ - for (i = 0; i < 6; i++) - if (test_facility(stfl_bits[i])) - elf_hwcap |= 1UL << i; - - if (test_facility(22) && test_facility(30)) - elf_hwcap |= HWCAP_S390_ETF3EH; - - /* - * Check for additional facilities with store-facility-list-extended. - * stfle stores doublewords (8 byte) with bit 1ULL<<63 as bit 0 - * and 1ULL<<0 as bit 63. Bits 0-31 contain the same information - * as stored by stfl, bits 32-xxx contain additional facilities. - * How many facility words are stored depends on the number of - * doublewords passed to the instruction. The additional facilities - * are: - * Bit 42: decimal floating point facility is installed - * Bit 44: perform floating point operation facility is installed - * translated to: - * HWCAP_S390_DFP bit 6 (42 && 44). - */ - if ((elf_hwcap & (1UL << 2)) && test_facility(42) && test_facility(44)) - elf_hwcap |= HWCAP_S390_DFP; + unsigned long amode31_size = __eamode31 - __samode31; + long amode31_offset, *ptr; - /* - * Huge page support HWCAP_S390_HPAGE is bit 7. - */ - if (MACHINE_HAS_EDAT1) - elf_hwcap |= HWCAP_S390_HPAGE; + amode31_offset = AMODE31_START - (unsigned long)__samode31; + pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); - /* - * 64-bit register support for 31-bit processes - * HWCAP_S390_HIGH_GPRS is bit 9. - */ - elf_hwcap |= HWCAP_S390_HIGH_GPRS; + /* Move original AMODE31 section to the new one */ + memmove((void *)physmem_info.reserved[RR_AMODE31].start, __samode31, amode31_size); + /* Zero out the old AMODE31 section to catch invalid accesses within it */ + memset(__samode31, 0, amode31_size); - /* - * Transactional execution support HWCAP_S390_TE is bit 10. - */ - if (MACHINE_HAS_TE) - elf_hwcap |= HWCAP_S390_TE; - - /* - * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension - * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX - * instead of facility bit 129. - */ - if (MACHINE_HAS_VX) { - elf_hwcap |= HWCAP_S390_VXRS; - if (test_facility(134)) - elf_hwcap |= HWCAP_S390_VXRS_BCD; - if (test_facility(135)) - elf_hwcap |= HWCAP_S390_VXRS_EXT; - if (test_facility(148)) - elf_hwcap |= HWCAP_S390_VXRS_EXT2; - if (test_facility(152)) - elf_hwcap |= HWCAP_S390_VXRS_PDE; - } - if (test_facility(150)) - elf_hwcap |= HWCAP_S390_SORT; - if (test_facility(151)) - elf_hwcap |= HWCAP_S390_DFLT; + /* Update all AMODE31 region references */ + for (ptr = _start_amode31_refs; ptr != _end_amode31_refs; ptr++) + *ptr += amode31_offset; +} - /* - * Guarded storage support HWCAP_S390_GS is bit 12. - */ - if (MACHINE_HAS_GS) - elf_hwcap |= HWCAP_S390_GS; - - get_cpu_id(&cpu_id); - add_device_randomness(&cpu_id, sizeof(cpu_id)); - switch (cpu_id.machine) { - case 0x2064: - case 0x2066: - default: /* Use "z900" as default for 64 bit kernels. */ - strcpy(elf_platform, "z900"); - break; - case 0x2084: - case 0x2086: - strcpy(elf_platform, "z990"); - break; - case 0x2094: - case 0x2096: - strcpy(elf_platform, "z9-109"); - break; - case 0x2097: - case 0x2098: - strcpy(elf_platform, "z10"); - break; - case 0x2817: - case 0x2818: - strcpy(elf_platform, "z196"); - break; - case 0x2827: - case 0x2828: - strcpy(elf_platform, "zEC12"); - break; - case 0x2964: - case 0x2965: - strcpy(elf_platform, "z13"); - break; - case 0x3906: - case 0x3907: - strcpy(elf_platform, "z14"); - break; - case 0x8561: - case 0x8562: - strcpy(elf_platform, "z15"); - break; - } +/* This must be called after AMODE31 relocation */ +static void __init setup_cr(void) +{ + union ctlreg2 cr2; + union ctlreg5 cr5; + union ctlreg15 cr15; - /* - * Virtualization support HWCAP_INT_SIE is bit 0. - */ - if (sclp.has_sief2) - int_hwcap |= HWCAP_INT_SIE; + __ctl_duct[1] = (unsigned long)__ctl_aste; + __ctl_duct[2] = (unsigned long)__ctl_aste; + __ctl_duct[4] = (unsigned long)__ctl_duald; - return 0; + /* Update control registers CR2, CR5 and CR15 */ + local_ctl_store(2, &cr2.reg); + local_ctl_store(5, &cr5.reg); + local_ctl_store(15, &cr15.reg); + cr2.ducto = (unsigned long)__ctl_duct >> 6; + cr5.pasteo = (unsigned long)__ctl_duct >> 6; + cr15.lsea = (unsigned long)__ctl_linkage_stack >> 3; + system_ctl_load(2, &cr2.reg); + system_ctl_load(5, &cr5.reg); + system_ctl_load(15, &cr15.reg); } -arch_initcall(setup_hwcaps); /* * Add system information as device randomness @@ -925,30 +813,13 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, - PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); - + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); - memblock_free((unsigned long) vmms, PAGE_SIZE); -} + memblock_free(vmms, PAGE_SIZE); -/* - * Find the correct size for the task_struct. This depends on - * the size of the struct fpu at the end of the thread_struct - * which is embedded in the task_struct. - */ -static void __init setup_task_size(void) -{ - int task_size = sizeof(struct task_struct); - - if (!MACHINE_HAS_VX) { - task_size -= sizeof(__vector128) * __NUM_VXRS; - task_size += sizeof(freg_t) * __NUM_FPRS; - } - arch_task_struct_size = task_size; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + static_branch_enable(&s390_arch_random_available); } /* @@ -966,7 +837,7 @@ static void __init setup_control_program_code(void) return; diag_stat_inc(DIAG_STAT_X318); - asm volatile("diag %0,0,0x318\n" : : "d" (diag318_info.val)); + asm volatile("diag %0,0,0x318" : : "d" (diag318_info.val)); } /* @@ -983,7 +854,7 @@ static void __init log_component_list(void) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); - ptr = (void *) early_ipl_comp_list_addr; + ptr = __va(early_ipl_comp_list_addr); end = (void *) ptr + early_ipl_comp_list_size; pr_info("The IPL report contains the following components:\n"); while (ptr < end) { @@ -1002,6 +873,23 @@ static void __init log_component_list(void) } /* + * Print avoiding interpretation of % in buf and taking bootdebug option + * into consideration. + */ +static void __init print_rb_entry(const char *buf) +{ + char fmt[] = KERN_SOH "0boot: %s"; + int level = printk_get_level(buf); + + buf = skip_timestamp(printk_skip_level(buf)); + if (level == KERN_DEBUG[1] && (!bootdebug || !bootdebug_filter_match(buf))) + return; + + fmt[1] = level; + printk(fmt, buf); +} + +/* * Setup function called from init/main.c just after the banner * was printed. */ @@ -1011,15 +899,21 @@ void __init setup_arch(char **cmdline_p) /* * print what head.S has found out about the machine */ - if (MACHINE_IS_VM) + if (machine_is_vm()) pr_info("Linux is running as a z/VM " "guest operating system in 64-bit mode\n"); - else if (MACHINE_IS_KVM) + else if (machine_is_kvm()) pr_info("Linux is running under KVM in 64-bit mode\n"); - else if (MACHINE_IS_LPAR) + else if (machine_is_lpar()) pr_info("Linux is running natively in 64-bit mode\n"); else pr_info("Linux is running as a guest in 64-bit mode\n"); + /* Print decompressor messages if not already printed */ + if (!boot_earlyprintk) + boot_rb_foreach(print_rb_entry); + + if (machine_has_relocated_lowcore()) + pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); log_component_list(); @@ -1043,51 +937,55 @@ void __init setup_arch(char **cmdline_p) os_info_init(); setup_ipl(); - setup_task_size(); setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ - reserve_above_ident_map(); + reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); - reserve_mem_detect_info(); + reserve_physmem_info(); + memblock_set_current_limit(ident_map_size); memblock_allow_resize(); /* Get information about *all* installed memory */ - memblock_add_mem_detect_info(); + memblock_add_physmem_info(); - free_mem_detect_info(); - - setup_uv(); + free_physmem_info(); setup_memory_end(); + setup_high_memory(); + memblock_dump_all(); setup_memory(); + + relocate_amode31_section(); + setup_cr(); + setup_uv(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); - if (MACHINE_HAS_EDAT2) + if (cpu_has_edat2()) hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); - check_initrd(); reserve_crashkernel(); #ifdef CONFIG_CRASH_DUMP /* - * Be aware that smp_save_dump_cpus() triggers a system reset. + * Be aware that smp_save_dump_secondary_cpus() triggers a system reset. * Therefore CPU and device initialization should be done afterwards. */ - smp_save_dump_cpus(); + smp_save_dump_secondary_cpus(); #endif setup_resources(); - setup_lowcore_dat_off(); + setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); cpu_init(); numa_setup(); smp_detect_cpus(); topology_init_early(); - + setup_protection_map(); /* - * Create kernel page tables and switch to virtual addressing. + * Create kernel page tables. */ paging_init(); @@ -1095,7 +993,9 @@ void __init setup_arch(char **cmdline_p) * After paging_init created the kernel page table, the new PSWs * in lowcore can now run with DAT enabled. */ - setup_lowcore_dat_on(); +#ifdef CONFIG_CRASH_DUMP + smp_save_dump_ipl_cpu(); +#endif /* Setup default console */ conmode_default(); @@ -1111,3 +1011,8 @@ void __init setup_arch(char **cmdline_p) /* Add system specific data to the random pool */ setup_randomness(); } + +void __init arch_cpu_finalize_init(void) +{ + sclp_init(); +} diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 78ef53b29958..4874de5edea0 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/rseq.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> @@ -25,14 +26,12 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> -#include <linux/tracehook.h> #include <linux/syscalls.h> -#include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> +#include <asm/vdso-symbols.h> +#include <asm/access-regs.h> #include <asm/lowcore.h> -#include <asm/switch_to.h> -#include <asm/vdso.h> #include "entry.h" /* @@ -109,7 +108,7 @@ struct rt_sigframe static void store_sigregs(void) { save_access_regs(current->thread.acrs); - save_fpu_regs(); + save_user_fpu_regs(); } /* Load registers after signal return */ @@ -131,7 +130,7 @@ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - fpregs_store(&user_sregs.fpregs, ¤t->thread.fpu); + fpregs_store(&user_sregs.fpregs, ¤t->thread.ufpu); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) return -EFAULT; return 0; @@ -141,7 +140,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { _sigregs user_sregs; - /* Alwys make any pending restarted system call return -EINTR */ + /* Always make any pending restarted system call return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) @@ -150,10 +149,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) return -EINVAL; - /* Test the floating-point-control word. */ - if (test_fp_ctl(user_sregs.fpregs.fpc)) - return -EINVAL; - /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = (regs->psw.mask & ~(PSW_MASK_USER | PSW_MASK_RI)) | (user_sregs.regs.psw.mask & (PSW_MASK_USER | PSW_MASK_RI)); @@ -169,7 +164,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); + fpregs_load(&user_sregs.fpregs, ¤t->thread.ufpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; @@ -183,13 +178,13 @@ static int save_sigregs_ext(struct pt_regs *regs, int i; /* Save vector registers to signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); + vxrs[i] = current->thread.ufpu.vxrs[i].low; if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, - current->thread.fpu.vxrs + __NUM_VXRS_LOW, + current->thread.ufpu.vxrs + __NUM_VXRS_LOW, sizeof(sregs_ext->vxrs_high))) return -EFAULT; } @@ -203,15 +198,15 @@ static int restore_sigregs_ext(struct pt_regs *regs, int i; /* Restore vector registers from signal stack */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, + __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW, &sregs_ext->vxrs_high, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; + current->thread.ufpu.vxrs[i].low = vxrs[i]; } return 0; } @@ -226,7 +221,7 @@ SYSCALL_DEFINE0(sigreturn) if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) goto badframe; set_current_blocked(&set); - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs(regs, &frame->sregs)) goto badframe; if (restore_sigregs_ext(regs, &frame->sregs_ext)) @@ -250,7 +245,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - save_fpu_regs(); + save_user_fpu_regs(); if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) @@ -294,14 +289,8 @@ static int setup_frame(int sig, struct k_sigaction *ka, unsigned long restorer; size_t frame_size; - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. - */ frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) frame_size += sizeof(frame->sregs_ext); frame = get_sigframe(ka, regs, frame_size); if (frame == (void __user *) -1UL) @@ -337,7 +326,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, sigreturn); + restorer = VDSO_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -371,14 +360,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, size_t frame_size; frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); - /* - * gprs_high are only present for a 31-bit task running on - * a 64-bit kernel (see compat_signal.c) but the space for - * gprs_high need to be allocated if vector registers are - * included in the signal frame on a 31-bit system. - */ uc_flags = 0; - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { frame_size += sizeof(_sigregs_ext); uc_flags |= UC_VXRS; } @@ -395,7 +378,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; else - restorer = VDSO64_SYMBOL(current, rt_sigreturn); + restorer = VDSO_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -453,7 +436,7 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset, * stack-frames in one go after that. */ -void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; sigset_t *oldset = sigmask_to_save(); @@ -466,7 +449,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; - if (has_signal && get_signal(&ksig)) { + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ if (current->thread.system_call) { regs->int_code = current->thread.system_call; @@ -494,10 +477,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) clear_pt_regs_flag(regs, PIF_SYSCALL); rseq_signal_deliver(&ksig, regs); - if (is_compat_task()) - handle_signal32(&ksig, oldset, regs); - else - handle_signal(&ksig, oldset, regs); + handle_signal(&ksig, oldset, regs); return; } @@ -510,10 +490,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) /* Restart with sys_restart_syscall */ regs->gprs[2] = regs->orig_gpr2; current->restart_block.arch_data = regs->psw.addr; - if (is_compat_task()) - regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); - else - regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + regs->psw.addr = VDSO_SYMBOL(current, restart_syscall); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; @@ -533,9 +510,3 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) */ restore_saved_sigmask(); } - -void do_notify_resume(struct pt_regs *regs) -{ - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); -} diff --git a/arch/s390/kernel/skey.c b/arch/s390/kernel/skey.c new file mode 100644 index 000000000000..cc869de6e3a5 --- /dev/null +++ b/arch/s390/kernel/skey.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/rwonce.h> +#include <asm/page.h> +#include <asm/skey.h> + +int skey_regions_initialized; + +static inline unsigned long load_real_address(unsigned long address) +{ + unsigned long real; + + asm volatile( + " lra %[real],0(%[address])" + : [real] "=d" (real) + : [address] "a" (address) + : "cc"); + return real; +} + +/* + * Initialize storage keys of registered memory regions with the + * default key. This is useful for code which is executed with a + * non-default access key. + */ +void __skey_regions_initialize(void) +{ + unsigned long address, real; + struct skey_region *r, *end; + + r = __skey_region_start; + end = __skey_region_end; + while (r < end) { + address = r->start & PAGE_MASK; + do { + real = load_real_address(address); + page_set_storage_key(real, PAGE_DEFAULT_KEY, 1); + address += PAGE_SIZE; + } while (address < r->end); + r++; + } + /* + * Make sure storage keys are initialized before + * skey_regions_initialized is changed. + */ + barrier(); + WRITE_ONCE(skey_regions_initialized, 1); +} diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 8984711f72ed..b7429f30afc1 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -5,7 +5,6 @@ * Copyright IBM Corp. 1999, 2012 * Author(s): Denis Joseph Barrow, * Martin Schwidefsky <schwidefsky@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, * * based on other smp stuff by * (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net> @@ -16,9 +15,9 @@ * operates on physical cpu numbers needs to go into smp.c. */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/export.h> @@ -37,16 +36,20 @@ #include <linux/sched/task_stack.h> #include <linux/crash_dump.h> #include <linux/kprobes.h> +#include <asm/access-regs.h> #include <asm/asm-offsets.h> +#include <asm/machine.h> +#include <asm/ctlreg.h> +#include <asm/pfault.h> #include <asm/diag.h> -#include <asm/switch_to.h> #include <asm/facility.h> +#include <asm/fpu.h> #include <asm/ipl.h> #include <asm/setup.h> #include <asm/irq.h> #include <asm/tlbflush.h> #include <asm/vtimer.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/sclp.h> #include <asm/debug.h> #include <asm/os_info.h> @@ -56,6 +59,7 @@ #include <asm/stacktrace.h> #include <asm/topology.h> #include <asm/vdso.h> +#include <asm/maccess.h> #include "entry.h" enum { @@ -71,18 +75,15 @@ enum { CPU_STATE_CONFIGURED, }; -static DEFINE_PER_CPU(struct cpu *, cpu_device); - -struct pcpu { - unsigned long ec_mask; /* bit mask for ec_xxx functions */ - unsigned long ec_clk; /* sigp timestamp for ec_xxx */ - signed char state; /* physical cpu state */ - signed char polarization; /* physical polarization */ - u16 address; /* physical cpu address */ -}; - static u8 boot_core_type; -static struct pcpu pcpu_devices[NR_CPUS]; +DEFINE_PER_CPU(struct pcpu, pcpu_devices); +/* + * Pointer to the pcpu area of the boot CPU. This is required when a restart + * interrupt is triggered on an offline CPU. For that case accessing percpu + * data with the common primitives does not work, since the percpu offset is + * stored in a non existent lowcore. + */ +static struct pcpu *ipl_pcpu; unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -95,13 +96,7 @@ __vector128 __initdata boot_cpu_vector_save_area[__NUM_VXRS]; #endif static unsigned int smp_max_threads __initdata = -1U; - -static int __init early_nosmt(char *s) -{ - smp_max_threads = 1; - return 0; -} -early_param("nosmt", early_nosmt); +cpumask_t cpu_setup_mask; static int __init early_smt(char *s) { @@ -112,7 +107,7 @@ early_param("smt", early_smt); /* * The smp_cpu_state_mutex must be held when changing the state or polarization - * member of a pcpu data structure within the pcpu_devices arreay. + * member of a pcpu data structure within the pcpu_devices array. */ DEFINE_MUTEX(smp_cpu_state_mutex); @@ -172,20 +167,17 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) int cpu; for_each_cpu(cpu, mask) - if (pcpu_devices[cpu].address == address) - return pcpu_devices + cpu; + if (per_cpu(pcpu_devices, cpu).address == address) + return &per_cpu(pcpu_devices, cpu); return NULL; } static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit) { - int order; - if (test_and_set_bit(ec_bit, &pcpu->ec_mask)) return; - order = pcpu_running(pcpu) ? SIGP_EXTERNAL_CALL : SIGP_EMERGENCY_SIGNAL; pcpu->ec_clk = get_tod_clock_fast(); - pcpu_sigp_retry(pcpu, order, 0); + pcpu_sigp_retry(pcpu, SIGP_EXTERNAL_CALL, 0); } static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) @@ -199,7 +191,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) mcck_stack = stack_alloc(); if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - memcpy(lc, &S390_lowcore, 512); + memcpy(lc, get_lowcore(), 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; @@ -207,16 +199,19 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->cpu_nr = cpu; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; - lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - if (nmi_alloc_per_cpu(lc)) + if (nmi_alloc_mcesa(&lc->mcesad)) goto out; + if (abs_lowcore_map(cpu, lc, true)) + goto out_mcesa; lowcore_ptr[cpu] = lc; - pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc); + pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc)); return 0; +out_mcesa: + nmi_free_mcesa(&lc->mcesad); out: stack_free(mcck_stack); stack_free(async_stack); @@ -225,20 +220,19 @@ out: return -ENOMEM; } -static void pcpu_free_lowcore(struct pcpu *pcpu) +static void pcpu_free_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; async_stack = lc->async_stack - STACK_INIT_OFFSET; mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET; pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0); lowcore_ptr[cpu] = NULL; - nmi_free_per_cpu(lc); + abs_lowcore_unmap(cpu); + nmi_free_mcesa(&lc->mcesad); stack_free(async_stack); stack_free(mcck_stack); free_pages(nodat_stack, THREAD_SIZE_ORDER); @@ -247,36 +241,37 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { - struct lowcore *lc = lowcore_ptr[cpu]; + struct lowcore *lc, *abs_lc; + lc = lowcore_ptr[cpu]; cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->pcpu = (unsigned long)pcpu; + lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; - lc->kernel_asce = S390_lowcore.kernel_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; lc->user_asce = s390_invalid_asce; - lc->machine_flags = S390_lowcore.machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; - __ctl_store(lc->cregs_save_area, 0, 15); - lc->cregs_save_area[1] = lc->kernel_asce; + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); + lc->cregs_save_area[1] = lc->user_asce; lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); arch_spin_lock_setup(cpu); } -static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +static void pcpu_attach_task(int cpu, struct task_struct *tsk) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; - lc->kernel_stack = (unsigned long) task_stack_page(tsk) - + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = (unsigned long) tsk; + lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; + lc->current_task = (unsigned long)tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; lc->user_timer = tsk->thread.user_timer; @@ -285,20 +280,21 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) lc->hardirq_timer = tsk->thread.hardirq_timer; lc->softirq_timer = tsk->thread.softirq_timer; lc->steal_timer = 0; +#ifdef CONFIG_STACKPROTECTOR + lc->stack_canary = tsk->stack_canary; +#endif } -static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; - lc->restart_stack = lc->nodat_stack; + lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; - lc->restart_source = -1UL; - pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); + lc->restart_source = -1U; + pcpu_sigp_retry(per_cpu_ptr(&pcpu_devices, cpu), SIGP_RESTART, 0); } typedef void (pcpu_delegate_fn)(void *); @@ -311,31 +307,42 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - pcpu_delegate_fn *func, - void *data, unsigned long stack) +static void __noreturn pcpu_delegate(struct pcpu *pcpu, int cpu, + pcpu_delegate_fn *func, + void *data, unsigned long stack) { - struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; - unsigned long source_cpu = stap(); + struct lowcore *lc, *abs_lc; + unsigned int source_cpu; + + lc = lowcore_ptr[cpu]; + source_cpu = stap(); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); if (pcpu->address == source_cpu) { call_on_stack(2, stack, void, __pcpu_delegate, pcpu_delegate_fn *, func, void *, data); } /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + pcpu_sigp_retry(pcpu, SIGP_CPU_RESET, 0); /* Restart func on the target cpu and stop the current cpu. */ - mem_assign_absolute(lc->restart_stack, stack); - mem_assign_absolute(lc->restart_fn, (unsigned long) func); - mem_assign_absolute(lc->restart_data, (unsigned long) data); - mem_assign_absolute(lc->restart_source, source_cpu); - __bpon(); + if (lc) { + lc->restart_stack = stack; + lc->restart_fn = (unsigned long)func; + lc->restart_data = (unsigned long)data; + lc->restart_source = source_cpu; + } else { + abs_lc = get_abs_lowcore(); + abs_lc->restart_stack = stack; + abs_lc->restart_fn = (unsigned long)func; + abs_lc->restart_data = (unsigned long)data; + abs_lc->restart_source = source_cpu; + put_abs_lowcore(abs_lc); + } asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" "1: sigp 0,%1,%3 # sigp stop to current cpu\n" - " brc 2,1b # busy, try again\n" + " brc 2,1b # busy, try again" : : "d" (pcpu->address), "d" (source_cpu), "K" (SIGP_RESTART), "K" (SIGP_STOP) : "0", "1", "cc"); @@ -357,38 +364,22 @@ static int pcpu_set_smt(unsigned int mtid) smp_cpu_mt_shift = 0; while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) smp_cpu_mt_shift++; - pcpu_devices[0].address = stap(); + per_cpu(pcpu_devices, 0).address = stap(); } return cc; } /* - * Call function on an online CPU. - */ -void smp_call_online_cpu(void (*func)(void *), void *data) -{ - struct pcpu *pcpu; - - /* Use the current cpu if it is online. */ - pcpu = pcpu_find_address(cpu_online_mask, stap()); - if (!pcpu) - /* Use the first online cpu. */ - pcpu = pcpu_devices + cpumask_first(cpu_online_mask); - pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); -} - -/* * Call function on the ipl CPU. */ -void smp_call_ipl_cpu(void (*func)(void *), void *data) +void __noreturn smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; - if (pcpu_devices[0].address == stap()) - lc = &S390_lowcore; + if (ipl_pcpu->address == stap()) + lc = get_lowcore(); - pcpu_delegate(&pcpu_devices[0], func, data, - lc->nodat_stack); + pcpu_delegate(ipl_pcpu, 0, func, data, lc->nodat_stack); } int smp_find_processor_id(u16 address) @@ -396,21 +387,21 @@ int smp_find_processor_id(u16 address) int cpu; for_each_present_cpu(cpu) - if (pcpu_devices[cpu].address == address) + if (per_cpu(pcpu_devices, cpu).address == address) return cpu; return -1; } void schedule_mcck_handler(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_mcck_pending); } bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; - if (pcpu_running(pcpu_devices + cpu)) + if (pcpu_running(per_cpu_ptr(&pcpu_devices, cpu))) return false; return true; } @@ -418,11 +409,11 @@ EXPORT_SYMBOL(arch_vcpu_is_preempted); void notrace smp_yield_cpu(int cpu) { - if (!MACHINE_HAS_DIAG9C) + if (!machine_has_diag9c()) return; diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); + : : "d" (per_cpu(pcpu_devices, cpu).address)); } EXPORT_SYMBOL_GPL(smp_yield_cpu); @@ -441,18 +432,18 @@ void notrace smp_emergency_stop(void) cpumask_copy(&cpumask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpumask); - end = get_tod_clock() + (1000000UL << 12); + end = get_tod_clock_monotonic() + (1000000UL << 12); for_each_cpu(cpu, &cpumask) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, 0, NULL) == SIGP_CC_BUSY && - get_tod_clock() < end) + get_tod_clock_monotonic() < end) cpu_relax(); } - while (get_tod_clock() < end) { + while (get_tod_clock_monotonic() < end) { for_each_cpu(cpu, &cpumask) - if (pcpu_stopped(pcpu_devices + cpu)) + if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu))) cpumask_clear_cpu(cpu, &cpumask); if (cpumask_empty(&cpumask)) break; @@ -467,10 +458,11 @@ NOKPROBE_SYMBOL(smp_emergency_stop); */ void smp_send_stop(void) { + struct pcpu *pcpu; int cpu; /* Disable all interrupts/machine checks */ - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); + __load_psw_mask(PSW_KERNEL_BITS); trace_hardirqs_off(); debug_set_critical(); @@ -482,8 +474,9 @@ void smp_send_stop(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); - while (!pcpu_stopped(pcpu_devices + cpu)) + pcpu = per_cpu_ptr(&pcpu_devices, cpu); + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu)) cpu_relax(); } } @@ -497,7 +490,7 @@ static void smp_handle_ext_call(void) unsigned long bits; /* handle bit signal external calls */ - bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + bits = this_cpu_xchg(pcpu_devices.ec_mask, 0); if (test_bit(ec_stop_cpu, &bits)) smp_stop_cpu(); if (test_bit(ec_schedule, &bits)) @@ -505,7 +498,7 @@ static void smp_handle_ext_call(void) if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); if (test_bit(ec_mcck_pending, &bits)) - __s390_handle_mcck(); + s390_handle_mcck(); if (test_bit(ec_irq_work, &bits)) irq_work_run(); } @@ -522,12 +515,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) int cpu; for_each_cpu(cpu, mask) - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } void arch_send_call_function_single_ipi(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } /* @@ -535,62 +528,18 @@ void arch_send_call_function_single_ipi(int cpu) * it goes straight through and wastes no time serializing * anything. Worst case is that we lose a reschedule ... */ -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_schedule); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_schedule); } #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_irq_work); } #endif -/* - * parameter area for the set/clear control bit callbacks - */ -struct ec_creg_mask_parms { - unsigned long orval; - unsigned long andval; - int cr; -}; - -/* - * callback for setting/clearing control bits - */ -static void smp_ctl_bit_callback(void *info) -{ - struct ec_creg_mask_parms *pp = info; - unsigned long cregs[16]; - - __ctl_store(cregs, 0, 15); - cregs[pp->cr] = (cregs[pp->cr] & pp->andval) | pp->orval; - __ctl_load(cregs, 0, 15); -} - -/* - * Set a bit in a control register of all cpus - */ -void smp_ctl_set_bit(int cr, int bit) -{ - struct ec_creg_mask_parms parms = { 1UL << bit, -1UL, cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_set_bit); - -/* - * Clear a bit in a control register of all cpus - */ -void smp_ctl_clear_bit(int cr, int bit) -{ - struct ec_creg_mask_parms parms = { 0, ~(1UL << bit), cr }; - - on_each_cpu(smp_ctl_bit_callback, &parms, 1); -} -EXPORT_SYMBOL(smp_ctl_clear_bit); - #ifdef CONFIG_CRASH_DUMP int smp_store_status(int cpu) @@ -599,16 +548,16 @@ int smp_store_status(int cpu) struct pcpu *pcpu; unsigned long pa; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); lc = lowcore_ptr[cpu]; pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) return -EIO; - if (!MACHINE_HAS_VX && !MACHINE_HAS_GS) + if (!cpu_has_vx() && !cpu_has_gs()) return 0; - pa = __pa(lc->mcesad & MCESA_ORIGIN_MASK); - if (MACHINE_HAS_GS) + pa = lc->mcesad & MCESA_ORIGIN_MASK; + if (cpu_has_gs()) pa |= lc->mcesad & MCESA_LC_MASK; if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_ADDITIONAL_STATUS, pa) != SIGP_CC_ORDER_CODE_ACCEPTED) @@ -618,7 +567,7 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. - * There are four cases: + * There are three cases: * 1) standard zfcp/nvme dump * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected @@ -631,53 +580,48 @@ int smp_store_status(int cpu) * with sigp stop-and-store-status. The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the * memory of the old system. - * 3) kdump and the old kernel did not store the CPU state, - * or stand-alone kdump for DASD - * condition: OLDMEM_BASE != NULL && !is_kdump_kernel() + * 3) kdump or stand-alone kdump for DASD + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == false * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The kexec code or the boot-loader * stored the registers of the boot CPU in the memory of the old system. - * 4) kdump and the old kernel stored the CPU state - * condition: OLDMEM_BASE != NULL && is_kdump_kernel() - * This case does not exist for s390 anymore, setup_arch explicitly - * deactivates the elfcorehdr= kernel parameter + * + * Note that the legacy kdump mode where the old kernel stored the CPU states + * does no longer exist: setup_arch() explicitly deactivates the elfcorehdr= + * kernel parameter. The is_kdump_kernel() implementation on s390 is independent + * of the elfcorehdr= parameter. */ -static __init void smp_save_cpu_vxrs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +static bool dump_available(void) { - __vector128 *vxrs = (__vector128 *) page; - - if (is_boot_cpu) - vxrs = boot_cpu_vector_save_area; - else - __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, page); - save_area_add_vxrs(sa, vxrs); + return oldmem_data.start || is_ipl_type_dump(); } -static __init void smp_save_cpu_regs(struct save_area *sa, u16 addr, - bool is_boot_cpu, unsigned long page) +void __init smp_save_dump_ipl_cpu(void) { - void *regs = (void *) page; + struct save_area *sa; + void *regs; - if (is_boot_cpu) - copy_oldmem_kernel(regs, (void *) __LC_FPREGS_SAVE_AREA, 512); - else - __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, page); + if (!dump_available()) + return; + sa = save_area_alloc(true); + regs = memblock_alloc_or_panic(512, 8); + copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); + memblock_free(regs, 512); + if (cpu_has_vx()) + save_area_add_vxrs(sa, boot_cpu_vector_save_area); } -void __init smp_save_dump_cpus(void) +void __init smp_save_dump_secondary_cpus(void) { int addr, boot_cpu_addr, max_cpu_addr; struct save_area *sa; - unsigned long page; - bool is_boot_cpu; + void *page; - if (!(OLDMEM_BASE || is_ipl_type_dump())) - /* No previous system present, normal boot. */ + if (!dump_available()) return; /* Allocate a page as dumping area for the store status sigps */ - page = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 1UL << 31); + page = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!page) panic("ERROR: Failed to allocate %lx bytes below %lx\n", PAGE_SIZE, 1UL << 31); @@ -687,46 +631,57 @@ void __init smp_save_dump_cpus(void) boot_cpu_addr = stap(); max_cpu_addr = SCLP_MAX_CORES << sclp.mtid_prev; for (addr = 0; addr <= max_cpu_addr; addr++) { + if (addr == boot_cpu_addr) + continue; if (__pcpu_sigp_relax(addr, SIGP_SENSE, 0) == SIGP_CC_NOT_OPERATIONAL) continue; - is_boot_cpu = (addr == boot_cpu_addr); - /* Allocate save area */ - sa = save_area_alloc(is_boot_cpu); - if (!sa) - panic("could not allocate memory for save area\n"); - if (MACHINE_HAS_VX) - /* Get the vector registers */ - smp_save_cpu_vxrs(sa, addr, is_boot_cpu, page); - /* - * For a zfcp/nvme dump OLDMEM_BASE == NULL and the registers - * of the boot CPU are stored in the HSA. To retrieve - * these registers an SCLP request is required which is - * done by drivers/s390/char/zcore.c:init_cpu_info() - */ - if (!is_boot_cpu || OLDMEM_BASE) - /* Get the CPU registers */ - smp_save_cpu_regs(sa, addr, is_boot_cpu, page); + sa = save_area_alloc(false); + __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); + save_area_add_regs(sa, page); + if (cpu_has_vx()) { + __pcpu_sigp_relax(addr, SIGP_STORE_ADDITIONAL_STATUS, __pa(page)); + save_area_add_vxrs(sa, page); + } } memblock_free(page, PAGE_SIZE); - diag_dma_ops.diag308_reset(); + diag_amode31_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ void smp_cpu_set_polarization(int cpu, int val) { - pcpu_devices[cpu].polarization = val; + per_cpu(pcpu_devices, cpu).polarization = val; } int smp_cpu_get_polarization(int cpu) { - return pcpu_devices[cpu].polarization; + return per_cpu(pcpu_devices, cpu).polarization; +} + +void smp_cpu_set_capacity(int cpu, unsigned long val) +{ + per_cpu(pcpu_devices, cpu).capacity = val; +} + +unsigned long smp_cpu_get_capacity(int cpu) +{ + return per_cpu(pcpu_devices, cpu).capacity; +} + +void smp_set_core_capacity(int cpu, unsigned long val) +{ + int i; + + cpu = smp_get_base_cpu(cpu); + for (i = cpu; (i <= cpu + smp_cpu_mtid) && (i < nr_cpu_ids); i++) + smp_cpu_set_capacity(i, val); } int smp_cpu_get_cpu_address(int cpu) { - return pcpu_devices[cpu].address; + return per_cpu(pcpu_devices, cpu).address; } static void __ref smp_get_core_info(struct sclp_core_info *info, int early) @@ -744,14 +699,13 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) continue; info->core[info->configured].core_id = address >> smp_cpu_mt_shift; + info->core[info->configured].type = boot_core_type; info->configured++; } info->combined = info->configured; } } -static int smp_add_present_cpu(int cpu); - static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, bool configured, bool early) { @@ -767,15 +721,16 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { if (pcpu_find_address(cpu_present_mask, address + i)) continue; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); pcpu->address = address + i; if (configured) pcpu->state = CPU_STATE_CONFIGURED; else pcpu->state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); set_cpu_present(cpu, true); - if (!early && smp_add_present_cpu(cpu) != 0) + if (!early && arch_register_cpu(cpu)) set_cpu_present(cpu, false); else nr++; @@ -793,7 +748,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) u16 core_id; int nr, i; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); nr = 0; cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); @@ -802,7 +757,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) * that all SMT threads get subsequent logical CPU numbers. */ if (early) { - core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + core_id = per_cpu(pcpu_devices, 0).address >> smp_cpu_mt_shift; for (i = 0; i < info->configured; i++) { core = &info->core[i]; if (core->core_id == core_id) { @@ -816,7 +771,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) nr += smp_add_core(&info->core[i], &avail, configured, early); } mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return nr; } @@ -827,10 +782,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { @@ -849,6 +801,7 @@ void __init smp_detect_cpus(void) mtid = boot_core_type ? sclp.mtid : sclp.mtid_cp; mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1; pcpu_set_smt(mtid); + cpu_smt_set_num_threads(smp_cpu_mtid + 1, smp_cpu_mtid + 1); /* Print number of CPUs */ c_cpus = s_cpus = 0; @@ -862,54 +815,47 @@ void __init smp_detect_cpus(void) s_cpus += smp_cpu_mtid + 1; } pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); - - /* Add CPUs present at boot */ - __smp_rescan_cpus(info, true); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_free(info, sizeof(*info)); } -static void smp_init_secondary(void) +/* + * Activate a secondary processor. + */ +static void smp_start_secondary(void *cpuvoid) { + struct lowcore *lc = get_lowcore(); int cpu = raw_smp_processor_id(); - S390_lowcore.last_update_clock = get_tod_clock(); - restore_access_regs(S390_lowcore.access_regs_save_area); + lc->last_update_clock = get_tod_clock(); + lc->restart_stack = (unsigned long)restart_stack; + lc->restart_fn = (unsigned long)do_restart; + lc->restart_data = 0; + lc->restart_source = -1U; + lc->restart_flags = 0; + restore_access_regs(lc->access_regs_save_area); cpu_init(); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); vdso_getcpu_init(); pfault_init(); + cpumask_set_cpu(cpu, &cpu_setup_mask); + update_cpu_masks(); notify_cpu_starting(cpu); if (topology_cpu_dedicated(cpu)) set_cpu_flag(CIF_DEDICATED_CPU); else clear_cpu_flag(CIF_DEDICATED_CPU); set_cpu_online(cpu, true); - update_cpu_masks(); inc_irq_stat(CPU_RST); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/* - * Activate a secondary processor. - */ -static void __no_sanitize_address smp_start_secondary(void *cpuvoid) -{ - S390_lowcore.restart_stack = (unsigned long) restart_stack; - S390_lowcore.restart_fn = (unsigned long) do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1UL; - __ctl_load(S390_lowcore.cregs_save_area, 0, 15); - __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack); -} - /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); int rc; if (pcpu->state != CPU_STATE_CONFIGURED) @@ -921,12 +867,18 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) rc = pcpu_alloc_lowcore(pcpu, cpu); if (rc) return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. + */ + system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); - pcpu_attach_task(pcpu, tidle); - pcpu_start_fn(pcpu, smp_start_secondary, NULL); + pcpu_attach_task(cpu, tidle); + pcpu_start_fn(cpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); + system_ctlreg_unlock(); return 0; } @@ -941,20 +893,23 @@ early_param("possible_cpus", _setup_possible_cpus); int __cpu_disable(void) { - unsigned long cregs[16]; + struct ctlreg cregs[16]; + int cpu; /* Handle possible pending IPIs */ smp_handle_ext_call(); - set_cpu_online(smp_processor_id(), false); + cpu = smp_processor_id(); + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, &cpu_setup_mask); update_cpu_masks(); /* Disable pseudo page faults on this cpu. */ pfault_fini(); /* Disable interrupt sources via control register. */ - __ctl_store(cregs, 0, 15); - cregs[0] &= ~0x0000ee70UL; /* disable all external interrupts */ - cregs[6] &= ~0xff000000UL; /* disable all I/O interrupts */ - cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ - __ctl_load(cregs, 0, 15); + __local_ctl_store(0, 15, cregs); + cregs[0].val &= ~0x0000ee70UL; /* disable all external interrupts */ + cregs[6].val &= ~0xff000000UL; /* disable all I/O interrupts */ + cregs[14].val &= ~0x1f000000UL; /* disable most machine checks */ + __local_ctl_load(0, 15, cregs); clear_cpu_flag(CIF_NOHZ_DELAY); return 0; } @@ -964,19 +919,19 @@ void __cpu_die(unsigned int cpu) struct pcpu *pcpu; /* Wait until target cpu is down */ - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); while (!pcpu_stopped(pcpu)) cpu_relax(); - pcpu_free_lowcore(pcpu); + pcpu_free_lowcore(pcpu, cpu); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); + pcpu->flags = 0; } void __noreturn cpu_die(void) { idle_task_exit(); - __bpon(); - pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + pcpu_sigp_retry(this_cpu_ptr(&pcpu_devices), SIGP_STOP, 0); for (;;) ; } @@ -995,30 +950,36 @@ void __init smp_fill_possible_mask(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - /* request the 0x1201 emergency signal external interrupt */ if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1201"); - /* request the 0x1202 external call external interrupt */ + system_ctl_set_bit(0, 14); if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); + system_ctl_set_bit(0, 13); + smp_rescan_cpus(true); } void __init smp_prepare_boot_cpu(void) { - struct pcpu *pcpu = pcpu_devices; + struct lowcore *lc = get_lowcore(); WARN_ON(!cpu_present(0) || !cpu_online(0)); - pcpu->state = CPU_STATE_CONFIGURED; - S390_lowcore.percpu_offset = __per_cpu_offset[0]; + lc->percpu_offset = __per_cpu_offset[0]; + ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0); + ipl_pcpu->state = CPU_STATE_CONFIGURED; + lc->pcpu = (unsigned long)ipl_pcpu; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(0, CPU_CAPACITY_HIGH); } void __init smp_setup_processor_id(void) { - pcpu_devices[0].address = stap(); - S390_lowcore.cpu_nr = 0; - S390_lowcore.spinlock_lockval = arch_spin_lockval(0); - S390_lowcore.spinlock_index = 0; + struct lowcore *lc = get_lowcore(); + + lc->cpu_nr = 0; + per_cpu(pcpu_devices, 0).address = stap(); + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; } /* @@ -1038,7 +999,7 @@ static ssize_t cpu_configure_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + count = sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -1055,18 +1016,16 @@ static ssize_t cpu_configure_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&smp_cpu_state_mutex); rc = -EBUSY; - /* disallow configuration changes of online cpus and cpu 0 */ + /* disallow configuration changes of online cpus */ cpu = dev->id; cpu = smp_get_base_cpu(cpu); - if (cpu == 0) - goto out; for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); rc = 0; switch (val) { case 0: @@ -1078,7 +1037,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_STANDBY; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1093,7 +1052,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_CONFIGURED; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_CONFIGURED; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1104,7 +1063,7 @@ static ssize_t cpu_configure_store(struct device *dev, } out: mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return rc ? rc : count; } static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); @@ -1112,7 +1071,7 @@ static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); + return sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address); } static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); @@ -1138,35 +1097,34 @@ static struct attribute_group cpu_online_attr_group = { static int smp_cpu_online(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - return sysfs_create_group(&s->kobj, &cpu_online_attr_group); + return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group); } static int smp_cpu_pre_down(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group); return 0; } -static int smp_add_present_cpu(int cpu) +bool arch_cpu_is_hotpluggable(int cpu) +{ + return !!cpu; +} + +int arch_register_cpu(int cpu) { - struct device *s; - struct cpu *c; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); int rc; - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - per_cpu(cpu_device, cpu) = c; - s = &c->dev; - c->hotpluggable = 1; + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); rc = register_cpu(c, cpu); if (rc) goto out; - rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + rc = sysfs_create_group(&c->dev.kobj, &cpu_common_attr_group); if (rc) goto out_cpu; rc = topology_cpu_init(c); @@ -1175,14 +1133,14 @@ static int smp_add_present_cpu(int cpu) return 0; out_topology: - sysfs_remove_group(&s->kobj, &cpu_common_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_common_attr_group); out_cpu: unregister_cpu(c); out: return rc; } -int __ref smp_rescan_cpus(void) +int __ref smp_rescan_cpus(bool early) { struct sclp_core_info *info; int nr; @@ -1191,7 +1149,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - nr = __smp_rescan_cpus(info, false); + nr = __smp_rescan_cpus(info, early); kfree(info); if (nr) topology_schedule_update(); @@ -1208,7 +1166,7 @@ static ssize_t __ref rescan_store(struct device *dev, rc = lock_device_hotplug_sysfs(); if (rc) return rc; - rc = smp_rescan_cpus(); + rc = smp_rescan_cpus(false); unlock_device_hotplug(); return rc ? rc : count; } @@ -1216,72 +1174,19 @@ static DEVICE_ATTR_WO(rescan); static int __init s390_smp_init(void) { - int cpu, rc = 0; + struct device *dev_root; + int rc; - rc = device_create_file(cpu_subsys.dev_root, &dev_attr_rescan); - if (rc) - return rc; - for_each_present_cpu(cpu) { - rc = smp_add_present_cpu(cpu); + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_rescan); + put_device(dev_root); if (rc) - goto out; + return rc; } - rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); rc = rc <= 0 ? rc : 0; -out: return rc; } subsys_initcall(s390_smp_init); - -static __always_inline void set_new_lowcore(struct lowcore *lc) -{ - union register_pair dst, src; - u32 pfx; - - src.even = (unsigned long) &S390_lowcore; - src.odd = sizeof(S390_lowcore); - dst.even = (unsigned long) lc; - dst.odd = sizeof(*lc); - pfx = (unsigned long) lc; - - asm volatile( - " mvcl %[dst],%[src]\n" - " spx %[pfx]\n" - : [dst] "+&d" (dst.pair), [src] "+&d" (src.pair) - : [pfx] "Q" (pfx) - : "memory", "cc"); -} - -static int __init smp_reinit_ipl_cpu(void) -{ - unsigned long async_stack, nodat_stack, mcck_stack; - struct lowcore *lc, *lc_ipl; - unsigned long flags; - - lc_ipl = lowcore_ptr[0]; - lc = (struct lowcore *) __get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER); - nodat_stack = __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); - async_stack = stack_alloc(); - mcck_stack = stack_alloc(); - if (!lc || !nodat_stack || !async_stack || !mcck_stack) - panic("Couldn't allocate memory"); - - local_irq_save(flags); - local_mcck_disable(); - set_new_lowcore(lc); - S390_lowcore.nodat_stack = nodat_stack + STACK_INIT_OFFSET; - S390_lowcore.async_stack = async_stack + STACK_INIT_OFFSET; - S390_lowcore.mcck_stack = mcck_stack + STACK_INIT_OFFSET; - lowcore_ptr[0] = lc; - local_mcck_enable(); - local_irq_restore(flags); - - free_pages(lc_ipl->async_stack - STACK_INIT_OFFSET, THREAD_SIZE_ORDER); - memblock_free_late(lc_ipl->mcck_stack - STACK_INIT_OFFSET, THREAD_SIZE); - memblock_free_late((unsigned long) lc_ipl, sizeof(*lc_ipl)); - - return 0; -} -early_initcall(smp_reinit_ipl_cpu); diff --git a/arch/s390/kernel/stackprotector.c b/arch/s390/kernel/stackprotector.c new file mode 100644 index 000000000000..d4e40483f008 --- /dev/null +++ b/arch/s390/kernel/stackprotector.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef pr_fmt +#define pr_fmt(fmt) "stackprot: " fmt +#endif + +#include <linux/export.h> +#include <linux/uaccess.h> +#include <linux/printk.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> +#include <asm/machine.h> +#include <asm/asm-offsets.h> +#include <asm/arch-stackprotector.h> + +#ifdef __DECOMPRESSOR + +#define DEBUGP boot_debug +#define EMERGP boot_emerg +#define PANIC boot_panic + +#else /* __DECOMPRESSOR */ + +#define DEBUGP pr_debug +#define EMERGP pr_emerg +#define PANIC panic + +#endif /* __DECOMPRESSOR */ + +int __bootdata_preserved(stack_protector_debug); + +unsigned long __stack_chk_guard; +EXPORT_SYMBOL(__stack_chk_guard); + +struct insn_ril { + u8 opc1 : 8; + u8 r1 : 4; + u8 opc2 : 4; + u32 imm; +} __packed; + +/* + * Convert a virtual instruction address to a real instruction address. The + * decompressor needs to patch instructions within the kernel image based on + * their virtual addresses, while dynamic address translation is still + * disabled. Therefore a translation from virtual kernel image addresses to + * the corresponding physical addresses is required. + * + * After dynamic address translation is enabled and when the kernel needs to + * patch instructions such a translation is not required since the addresses + * are identical. + */ +static struct insn_ril *vaddress_to_insn(unsigned long vaddress) +{ +#ifdef __DECOMPRESSOR + return (struct insn_ril *)__kernel_pa(vaddress); +#else + return (struct insn_ril *)vaddress; +#endif +} + +static unsigned long insn_to_vaddress(struct insn_ril *insn) +{ +#ifdef __DECOMPRESSOR + return (unsigned long)__kernel_va(insn); +#else + return (unsigned long)insn; +#endif +} + +#define INSN_RIL_STRING_SIZE (sizeof(struct insn_ril) * 2 + 1) + +static void insn_ril_to_string(char *str, struct insn_ril *insn) +{ + u8 *ptr = (u8 *)insn; + int i; + + for (i = 0; i < sizeof(*insn); i++) + hex_byte_pack(&str[2 * i], ptr[i]); + str[2 * i] = 0; +} + +static void stack_protector_dump(struct insn_ril *old, struct insn_ril *new) +{ + char ostr[INSN_RIL_STRING_SIZE]; + char nstr[INSN_RIL_STRING_SIZE]; + + insn_ril_to_string(ostr, old); + insn_ril_to_string(nstr, new); + DEBUGP("%016lx: %s -> %s\n", insn_to_vaddress(old), ostr, nstr); +} + +static int stack_protector_verify(struct insn_ril *insn, unsigned long kernel_start) +{ + char istr[INSN_RIL_STRING_SIZE]; + unsigned long vaddress, offset; + + /* larl */ + if (insn->opc1 == 0xc0 && insn->opc2 == 0x0) + return 0; + /* lgrl */ + if (insn->opc1 == 0xc4 && insn->opc2 == 0x8) + return 0; + insn_ril_to_string(istr, insn); + vaddress = insn_to_vaddress(insn); + if (__is_defined(__DECOMPRESSOR)) { + offset = (unsigned long)insn - kernel_start + TEXT_OFFSET; + EMERGP("Unexpected instruction at %016lx/%016lx: %s\n", vaddress, offset, istr); + PANIC("Stackprotector error\n"); + } else { + EMERGP("Unexpected instruction at %016lx: %s\n", vaddress, istr); + } + return -EINVAL; +} + +int __stack_protector_apply(unsigned long *start, unsigned long *end, unsigned long kernel_start) +{ + unsigned long canary, *loc; + struct insn_ril *insn, new; + int rc; + + /* + * Convert LARL/LGRL instructions to LLILF so register R1 contains the + * address of the per-cpu / per-process stack canary: + * + * LARL/LGRL R1,__stack_chk_guard => LLILF R1,__lc_stack_canary + */ + canary = __LC_STACK_CANARY; + if (machine_has_relocated_lowcore()) + canary += LOWCORE_ALT_ADDRESS; + for (loc = start; loc < end; loc++) { + insn = vaddress_to_insn(*loc); + rc = stack_protector_verify(insn, kernel_start); + if (rc) + return rc; + new = *insn; + new.opc1 = 0xc0; + new.opc2 = 0xf; + new.imm = canary; + if (stack_protector_debug) + stack_protector_dump(insn, &new); + s390_kernel_write(insn, &new, sizeof(*insn)); + } + return 0; +} + +#ifdef __DECOMPRESSOR +void __stack_protector_apply_early(unsigned long kernel_start) +{ + unsigned long *start, *end; + + start = (unsigned long *)vmlinux.stack_prot_start; + end = (unsigned long *)vmlinux.stack_prot_end; + __stack_protector_apply(start, end, kernel_start); +} +#endif diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 101477b3e263..3aae7f70e6ab 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -3,13 +3,16 @@ * Stack trace management functions * * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/perf_event.h> #include <linux/stacktrace.h> +#include <linux/uaccess.h> +#include <asm/asm-offsets.h> #include <asm/stacktrace.h> #include <asm/unwind.h> #include <asm/kprobes.h> +#include <asm/ptrace.h> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -41,12 +44,12 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; -#ifdef CONFIG_KPROBES +#ifdef CONFIG_RETHOOK /* - * Mark stacktraces with kretprobed functions on them + * Mark stacktraces with krethook functions on them * as unreliable. */ - if (state.ip == (unsigned long)kretprobe_trampoline) + if (state.ip == (unsigned long)arch_rethook_trampoline) return -EINVAL; #endif @@ -59,3 +62,101 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; return 0; } + +static inline bool store_ip(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, bool perf, + unsigned long ip) +{ +#ifdef CONFIG_PERF_EVENTS + if (perf) { + if (perf_callchain_store(entry, ip)) + return false; + return true; + } +#endif + return consume_entry(cookie, ip); +} + +static inline bool ip_invalid(unsigned long ip) +{ + /* + * Perform some basic checks if an instruction address taken + * from unreliable source is invalid. + */ + if (ip & 1) + return true; + if (ip < mmap_min_addr) + return true; + if (ip >= current->mm->context.asce_limit) + return true; + return false; +} + +static inline bool ip_within_vdso(unsigned long ip) +{ + return in_range(ip, current->mm->context.vdso_base, vdso_text_size()); +} + +void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, + const struct pt_regs *regs, bool perf) +{ + struct stack_frame_vdso_wrapper __user *sf_vdso; + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (!current->mm) + return; + ip = instruction_pointer(regs); + if (!store_ip(consume_entry, cookie, entry, perf, ip)) + return; + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (1) { + if (__get_user(sp, &sf->back_chain)) + break; + /* + * VDSO entry code has a non-standard stack frame layout. + * See VDSO user wrapper code for details. + */ + if (!sp && ip_within_vdso(ip)) { + sf_vdso = (void __user *)sf; + if (__get_user(ip, &sf_vdso->return_address)) + break; + sp = (unsigned long)sf + STACK_FRAME_VDSO_OVERHEAD; + sf = (void __user *)sp; + if (__get_user(sp, &sf->back_chain)) + break; + } else { + sf = (void __user *)sp; + if (__get_user(ip, &sf->gprs[8])) + break; + } + /* Sanity check: ABI requires SP to be 8 byte aligned. */ + if (sp & 0x7) + break; + if (ip_invalid(ip)) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (!first) + break; + ip = regs->gprs[14]; + if (ip_invalid(ip)) + break; + } + if (!store_ip(consume_entry, cookie, entry, perf, ip)) + break; + first = false; + } + pagefault_enable(); +} + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + arch_stack_walk_user_common(consume_entry, cookie, NULL, regs, false); +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 4d141e2c132e..5eae2e25997a 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -5,6 +5,8 @@ * Copyright IBM Corp. 2016 * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com> */ + +#include <linux/export.h> #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> @@ -17,6 +19,7 @@ #include <asm/ebcdic.h> #include <asm/facility.h> #include <asm/sthyi.h> +#include <asm/asm.h> #include "entry.h" #define DED_WEIGHT 0xffff @@ -250,7 +253,7 @@ static void fill_diag_mac(struct sthyi_sctns *sctns, sctns->mac.infmval1 |= MAC_CNT_VLD; } -/* Returns a pointer to the the next partition block. */ +/* Returns a pointer to the next partition block. */ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, bool this_lpar, void *diag224_buf, @@ -300,32 +303,57 @@ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, return (struct diag204_x_part_block *)&block->cpus[i]; } -static void fill_diag(struct sthyi_sctns *sctns) +static void *diag204_get_data(bool diag204_allow_busy) { - int i, r, pages; - bool this_lpar; + unsigned long subcode; void *diag204_buf; + int pages, rc; + + subcode = DIAG204_SUBC_RSI; + subcode |= DIAG204_INFO_EXT; + pages = diag204(subcode, 0, NULL); + if (pages < 0) + return ERR_PTR(pages); + if (pages == 0) + return ERR_PTR(-ENODATA); + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + subcode = DIAG204_SUBC_STIB7; + subcode |= DIAG204_INFO_EXT; + if (diag204_has_bif() && diag204_allow_busy) + subcode |= DIAG204_BIF_BIT; + rc = diag204(subcode, pages, diag204_buf); + if (rc < 0) { + vfree(diag204_buf); + return ERR_PTR(rc); + } + return diag204_buf; +} + +static bool is_diag204_cached(struct sthyi_sctns *sctns) +{ + /* + * Check if validity bits are set when diag204 data + * is gathered. + */ + if (sctns->par.infpval1) + return true; + return false; +} + +static void fill_diag(struct sthyi_sctns *sctns, void *diag204_buf) +{ + int i; + bool this_lpar; void *diag224_buf = NULL; struct diag204_x_info_blk_hdr *ti_hdr; struct diag204_x_part_block *part_block; struct diag204_x_phys_block *phys_block; struct lpar_cpu_inf lpar_inf = {}; - /* Errors are handled through the validity bits in the response. */ - pages = diag204((unsigned long)DIAG204_SUBC_RSI | - (unsigned long)DIAG204_INFO_EXT, 0, NULL); - if (pages <= 0) - return; - - diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); - if (!diag204_buf) - return; - - r = diag204((unsigned long)DIAG204_SUBC_STIB7 | - (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); - if (r < 0) - goto out; - diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!diag224_buf || diag224(diag224_buf)) goto out; @@ -390,7 +418,6 @@ static void fill_diag(struct sthyi_sctns *sctns) out: free_page((unsigned long)diag224_buf); - vfree(diag204_buf); } static int sthyi(u64 vaddr, u64 *rc) @@ -401,30 +428,41 @@ static int sthyi(u64 vaddr, u64 *rc) asm volatile( ".insn rre,0xB2560000,%[r1],%[r2]\n" - "ipm %[cc]\n" - "srl %[cc],28\n" - : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [r2] "+&d" (r2.pair) : [r1] "d" (r1.pair) - : "memory", "cc"); + : CC_CLOBBER_LIST("memory")); *rc = r2.odd; - return cc; + return CC_TRANSFORM(cc); } static int fill_dst(void *dst, u64 *rc) { + void *diag204_buf; + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; /* * If the facility is on, we don't want to emulate the instruction. * We ask the hypervisor to provide the data. */ - if (test_facility(74)) + if (test_facility(74)) { + memset(dst, 0, PAGE_SIZE); return sthyi((u64)dst, rc); - + } + /* + * When emulating, if diag204 returns BUSY don't reset dst buffer + * and use cached data. + */ + *rc = 0; + diag204_buf = diag204_get_data(is_diag204_cached(sctns)); + if (IS_ERR(diag204_buf)) + return PTR_ERR(diag204_buf); + memset(dst, 0, PAGE_SIZE); fill_hdr(sctns); fill_stsi(sctns); - fill_diag(sctns); - *rc = 0; + fill_diag(sctns, diag204_buf); + vfree(diag204_buf); return 0; } @@ -443,11 +481,14 @@ static int sthyi_update_cache(u64 *rc) { int r; - memset(sthyi_cache.info, 0, PAGE_SIZE); r = fill_dst(sthyi_cache.info, rc); - if (r) - return r; - sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + if (r == 0) { + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + } else if (r == -EBUSY) { + /* mark as expired and return 0 to keep using cached data */ + sthyi_cache.end = jiffies - 1; + r = 0; + } return r; } @@ -459,9 +500,9 @@ static int sthyi_update_cache(u64 *rc) * * Fills the destination with system information returned by the STHYI * instruction. The data is generated by emulation or execution of STHYI, - * if available. The return value is the condition code that would be - * returned, the rc parameter is the return code which is passed in - * register R2 + 1. + * if available. The return value is either a negative error value or + * the condition code that would be returned, the rc parameter is the + * return code which is passed in register R2 + 1. */ int sthyi_fill(void *dst, u64 *rc) { diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 8fe2d23b64f4..795b6cca74c9 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -12,6 +12,7 @@ * platform. */ +#include <linux/cpufeature.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/mm.h> @@ -38,32 +39,15 @@ #include "entry.h" -/* - * Perform the mmap() system call. Linux for S/390 isn't able to handle more - * than 5 system call parameters, so this system call uses a memory block - * for parameter passing. - */ +#define __SYSCALL(nr, sym) long __s390x_##sym(struct pt_regs *); +#include <asm/syscall_table.h> +#undef __SYSCALL -struct s390_mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; +#define __SYSCALL(nr, sym) [nr] = (__s390x_##sym), +const sys_call_ptr_t sys_call_table[__NR_syscalls] = { +#include <asm/syscall_table.h> }; - -SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) -{ - struct s390_mmap_arg_struct a; - int error = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -out: - return error; -} +#undef __SYSCALL #ifdef CONFIG_SYSVIPC /* @@ -108,25 +92,35 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -static void do_syscall(struct pt_regs *regs) +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { unsigned long nr; + add_random_kstack_offset(); + enter_from_user_mode(regs); + regs->psw = get_lowcore()->svc_old_psw; + regs->int_code = get_lowcore()->svc_int_code; + update_timer_sys(); + if (cpu_has_bear()) + current->thread.last_break = regs->last_break; + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + if (unlikely(per_trap)) + set_thread_flag(TIF_PER_TRAP); + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); nr = regs->int_code & 0xffff; - if (!nr) { + if (likely(!nr)) { nr = regs->gprs[1] & 0xffff; regs->int_code &= ~0xffffUL; regs->int_code |= nr; } - regs->gprs[2] = nr; - if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { regs->psw.addr = current->restart_block.arch_data; current->restart_block.arch_data = 1; } nr = syscall_enter_from_user_mode_work(regs, nr); - /* * In the s390 ptrace ABI, both the syscall number and the return value * use gpr2. However, userspace puts the syscall number either in the @@ -134,35 +128,11 @@ static void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ - if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) goto out; regs->gprs[2] = -ENOSYS; - if (likely(nr >= NR_syscalls)) - goto out; - do { - regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); + if (likely(nr < NR_syscalls)) + regs->gprs[2] = sys_call_table[nr](regs); out: - syscall_exit_to_user_mode_work(regs); -} - -void noinstr __do_syscall(struct pt_regs *regs, int per_trap) -{ - add_random_kstack_offset(); - enter_from_user_mode(regs); - regs->psw = S390_lowcore.svc_old_psw; - regs->int_code = S390_lowcore.svc_int_code; - update_timer_sys(); - - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - - if (per_trap) - set_thread_flag(TIF_PER_TRAP); - - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - exit_to_user_mode(); + syscall_exit_to_user_mode(regs); } diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index b98f25029b8e..d5fca0ca0890 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -1,46 +1,32 @@ # SPDX-License-Identifier: GPL-2.0 +kapi := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm -gen := arch/$(ARCH)/include/generated -kapi := $(gen)/asm -uapi := $(gen)/uapi/asm +$(shell mkdir -p $(uapi) $(kapi)) -syscall := $(srctree)/$(src)/syscall.tbl -systbl := $(srctree)/$(src)/syscalltbl +syscall := $(src)/syscall.tbl +syshdr := $(srctree)/scripts/syscallhdr.sh +systbl := $(srctree)/scripts/syscalltbl.sh -gen-y := $(kapi)/syscall_table.h -kapi-hdrs-y := $(kapi)/unistd_nr.h -uapi-hdrs-y := $(uapi)/unistd_32.h -uapi-hdrs-y += $(uapi)/unistd_64.h +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis common,$* $< $@ -targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis common,$* $< $@ -PHONY += kapi uapi +$(uapi)/unistd_%.h: $(syscall) $(syshdr) FORCE + $(call if_changed,syshdr) -kapi: $(gen-y) $(kapi-hdrs-y) -uapi: $(uapi-hdrs-y) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE + $(call if_changed,systbl) +uapisyshdr-y += unistd_64.h +kapisyshdr-y += syscall_table.h -# Create output directory if not already present -_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ - $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') +uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) +kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) +targets += $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y)) -filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< - -filechk_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< - -filechk_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< - -syshdr_abi_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) FORCE - $(call filechk,syshdr,$@) - -syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) FORCE - $(call filechk,syshdr,$@) - -$(kapi)/syscall_table.h: $(syscall) FORCE - $(call filechk,syscalls) - -sysnr_abi_unistd_nr := common,32,64 -$(kapi)/unistd_nr.h: $(syscall) FORCE - $(call filechk,sysnr) +PHONY += all +all: $(uapisyshdr-y) $(kapisyshdr-y) + @: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..417ed16b3c63 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -3,449 +3,397 @@ # System call table for s390 # # Format: +# <nr> <abi> <syscall> <entry> # -# <nr> <abi> <syscall> <entry-64bit> <compat-entry> -# -# where <abi> can be common, 64, or 32 +# <abi> is always common. -1 common exit sys_exit sys_exit -2 common fork sys_fork sys_fork -3 common read sys_read compat_sys_s390_read -4 common write sys_write compat_sys_s390_write -5 common open sys_open compat_sys_open -6 common close sys_close sys_close -7 common restart_syscall sys_restart_syscall sys_restart_syscall -8 common creat sys_creat sys_creat -9 common link sys_link sys_link -10 common unlink sys_unlink sys_unlink -11 common execve sys_execve compat_sys_execve -12 common chdir sys_chdir sys_chdir -13 32 time - sys_time32 -14 common mknod sys_mknod sys_mknod -15 common chmod sys_chmod sys_chmod -16 32 lchown - sys_lchown16 -19 common lseek sys_lseek compat_sys_lseek -20 common getpid sys_getpid sys_getpid -21 common mount sys_mount sys_mount -22 common umount sys_oldumount sys_oldumount -23 32 setuid - sys_setuid16 -24 32 getuid - sys_getuid16 -25 32 stime - sys_stime32 -26 common ptrace sys_ptrace compat_sys_ptrace -27 common alarm sys_alarm sys_alarm -29 common pause sys_pause sys_pause -30 common utime sys_utime sys_utime32 -33 common access sys_access sys_access -34 common nice sys_nice sys_nice -36 common sync sys_sync sys_sync -37 common kill sys_kill sys_kill -38 common rename sys_rename sys_rename -39 common mkdir sys_mkdir sys_mkdir -40 common rmdir sys_rmdir sys_rmdir -41 common dup sys_dup sys_dup -42 common pipe sys_pipe sys_pipe -43 common times sys_times compat_sys_times -45 common brk sys_brk sys_brk -46 32 setgid - sys_setgid16 -47 32 getgid - sys_getgid16 -48 common signal sys_signal sys_signal -49 32 geteuid - sys_geteuid16 -50 32 getegid - sys_getegid16 -51 common acct sys_acct sys_acct -52 common umount2 sys_umount sys_umount -54 common ioctl sys_ioctl compat_sys_ioctl -55 common fcntl sys_fcntl compat_sys_fcntl -57 common setpgid sys_setpgid sys_setpgid -60 common umask sys_umask sys_umask -61 common chroot sys_chroot sys_chroot -62 common ustat sys_ustat compat_sys_ustat -63 common dup2 sys_dup2 sys_dup2 -64 common getppid sys_getppid sys_getppid -65 common getpgrp sys_getpgrp sys_getpgrp -66 common setsid sys_setsid sys_setsid -67 common sigaction sys_sigaction compat_sys_sigaction -70 32 setreuid - sys_setreuid16 -71 32 setregid - sys_setregid16 -72 common sigsuspend sys_sigsuspend sys_sigsuspend -73 common sigpending sys_sigpending compat_sys_sigpending -74 common sethostname sys_sethostname sys_sethostname -75 common setrlimit sys_setrlimit compat_sys_setrlimit -76 32 getrlimit - compat_sys_old_getrlimit -77 common getrusage sys_getrusage compat_sys_getrusage -78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday -79 common settimeofday sys_settimeofday compat_sys_settimeofday -80 32 getgroups - sys_getgroups16 -81 32 setgroups - sys_setgroups16 -83 common symlink sys_symlink sys_symlink -85 common readlink sys_readlink sys_readlink -86 common uselib sys_uselib sys_uselib -87 common swapon sys_swapon sys_swapon -88 common reboot sys_reboot sys_reboot -89 common readdir - compat_sys_old_readdir -90 common mmap sys_old_mmap compat_sys_s390_old_mmap -91 common munmap sys_munmap sys_munmap -92 common truncate sys_truncate compat_sys_truncate -93 common ftruncate sys_ftruncate compat_sys_ftruncate -94 common fchmod sys_fchmod sys_fchmod -95 32 fchown - sys_fchown16 -96 common getpriority sys_getpriority sys_getpriority -97 common setpriority sys_setpriority sys_setpriority -99 common statfs sys_statfs compat_sys_statfs -100 common fstatfs sys_fstatfs compat_sys_fstatfs -101 32 ioperm - - -102 common socketcall sys_socketcall compat_sys_socketcall -103 common syslog sys_syslog sys_syslog -104 common setitimer sys_setitimer compat_sys_setitimer -105 common getitimer sys_getitimer compat_sys_getitimer -106 common stat sys_newstat compat_sys_newstat -107 common lstat sys_newlstat compat_sys_newlstat -108 common fstat sys_newfstat compat_sys_newfstat -110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie -111 common vhangup sys_vhangup sys_vhangup -112 common idle - - -114 common wait4 sys_wait4 compat_sys_wait4 -115 common swapoff sys_swapoff sys_swapoff -116 common sysinfo sys_sysinfo compat_sys_sysinfo -117 common ipc sys_s390_ipc compat_sys_s390_ipc -118 common fsync sys_fsync sys_fsync -119 common sigreturn sys_sigreturn compat_sys_sigreturn -120 common clone sys_clone sys_clone -121 common setdomainname sys_setdomainname sys_setdomainname -122 common uname sys_newuname sys_newuname -124 common adjtimex sys_adjtimex sys_adjtimex_time32 -125 common mprotect sys_mprotect sys_mprotect -126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask -127 common create_module - - -128 common init_module sys_init_module sys_init_module -129 common delete_module sys_delete_module sys_delete_module -130 common get_kernel_syms - - -131 common quotactl sys_quotactl sys_quotactl -132 common getpgid sys_getpgid sys_getpgid -133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush sys_bdflush -135 common sysfs sys_sysfs sys_sysfs -136 common personality sys_s390_personality sys_s390_personality -137 common afs_syscall - - -138 32 setfsuid - sys_setfsuid16 -139 32 setfsgid - sys_setfsgid16 -140 32 _llseek - sys_llseek -141 common getdents sys_getdents compat_sys_getdents -142 32 _newselect - compat_sys_select -142 64 select sys_select - -143 common flock sys_flock sys_flock -144 common msync sys_msync sys_msync -145 common readv sys_readv sys_readv -146 common writev sys_writev sys_writev -147 common getsid sys_getsid sys_getsid -148 common fdatasync sys_fdatasync sys_fdatasync -149 common _sysctl - - -150 common mlock sys_mlock sys_mlock -151 common munlock sys_munlock sys_munlock -152 common mlockall sys_mlockall sys_mlockall -153 common munlockall sys_munlockall sys_munlockall -154 common sched_setparam sys_sched_setparam sys_sched_setparam -155 common sched_getparam sys_sched_getparam sys_sched_getparam -156 common sched_setscheduler sys_sched_setscheduler sys_sched_setscheduler -157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler -158 common sched_yield sys_sched_yield sys_sched_yield -159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max -160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 -162 common nanosleep sys_nanosleep sys_nanosleep_time32 -163 common mremap sys_mremap sys_mremap -164 32 setresuid - sys_setresuid16 -165 32 getresuid - sys_getresuid16 -167 common query_module - - -168 common poll sys_poll sys_poll -169 common nfsservctl - - -170 32 setresgid - sys_setresgid16 -171 32 getresgid - sys_getresgid16 -172 common prctl sys_prctl sys_prctl -173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn -174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction -175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask -176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 -178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo -179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend -180 common pread64 sys_pread64 compat_sys_s390_pread64 -181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 -182 32 chown - sys_chown16 -183 common getcwd sys_getcwd sys_getcwd -184 common capget sys_capget sys_capget -185 common capset sys_capset sys_capset -186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 common sendfile sys_sendfile64 compat_sys_sendfile -188 common getpmsg - - -189 common putpmsg - - -190 common vfork sys_vfork sys_vfork -191 32 ugetrlimit - compat_sys_getrlimit -191 64 getrlimit sys_getrlimit - -192 32 mmap2 - compat_sys_s390_mmap2 -193 32 truncate64 - compat_sys_s390_truncate64 -194 32 ftruncate64 - compat_sys_s390_ftruncate64 -195 32 stat64 - compat_sys_s390_stat64 -196 32 lstat64 - compat_sys_s390_lstat64 -197 32 fstat64 - compat_sys_s390_fstat64 -198 32 lchown32 - sys_lchown -198 64 lchown sys_lchown - -199 32 getuid32 - sys_getuid -199 64 getuid sys_getuid - -200 32 getgid32 - sys_getgid -200 64 getgid sys_getgid - -201 32 geteuid32 - sys_geteuid -201 64 geteuid sys_geteuid - -202 32 getegid32 - sys_getegid -202 64 getegid sys_getegid - -203 32 setreuid32 - sys_setreuid -203 64 setreuid sys_setreuid - -204 32 setregid32 - sys_setregid -204 64 setregid sys_setregid - -205 32 getgroups32 - sys_getgroups -205 64 getgroups sys_getgroups - -206 32 setgroups32 - sys_setgroups -206 64 setgroups sys_setgroups - -207 32 fchown32 - sys_fchown -207 64 fchown sys_fchown - -208 32 setresuid32 - sys_setresuid -208 64 setresuid sys_setresuid - -209 32 getresuid32 - sys_getresuid -209 64 getresuid sys_getresuid - -210 32 setresgid32 - sys_setresgid -210 64 setresgid sys_setresgid - -211 32 getresgid32 - sys_getresgid -211 64 getresgid sys_getresgid - -212 32 chown32 - sys_chown -212 64 chown sys_chown - -213 32 setuid32 - sys_setuid -213 64 setuid sys_setuid - -214 32 setgid32 - sys_setgid -214 64 setgid sys_setgid - -215 32 setfsuid32 - sys_setfsuid -215 64 setfsuid sys_setfsuid - -216 32 setfsgid32 - sys_setfsgid -216 64 setfsgid sys_setfsgid - -217 common pivot_root sys_pivot_root sys_pivot_root -218 common mincore sys_mincore sys_mincore -219 common madvise sys_madvise sys_madvise -220 common getdents64 sys_getdents64 sys_getdents64 -221 32 fcntl64 - compat_sys_fcntl64 -222 common readahead sys_readahead compat_sys_s390_readahead -223 32 sendfile64 - compat_sys_sendfile64 -224 common setxattr sys_setxattr sys_setxattr -225 common lsetxattr sys_lsetxattr sys_lsetxattr -226 common fsetxattr sys_fsetxattr sys_fsetxattr -227 common getxattr sys_getxattr sys_getxattr -228 common lgetxattr sys_lgetxattr sys_lgetxattr -229 common fgetxattr sys_fgetxattr sys_fgetxattr -230 common listxattr sys_listxattr sys_listxattr -231 common llistxattr sys_llistxattr sys_llistxattr -232 common flistxattr sys_flistxattr sys_flistxattr -233 common removexattr sys_removexattr sys_removexattr -234 common lremovexattr sys_lremovexattr sys_lremovexattr -235 common fremovexattr sys_fremovexattr sys_fremovexattr -236 common gettid sys_gettid sys_gettid -237 common tkill sys_tkill sys_tkill -238 common futex sys_futex sys_futex_time32 -239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity -240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity -241 common tgkill sys_tgkill sys_tgkill -243 common io_setup sys_io_setup compat_sys_io_setup -244 common io_destroy sys_io_destroy sys_io_destroy -245 common io_getevents sys_io_getevents sys_io_getevents_time32 -246 common io_submit sys_io_submit compat_sys_io_submit -247 common io_cancel sys_io_cancel sys_io_cancel -248 common exit_group sys_exit_group sys_exit_group -249 common epoll_create sys_epoll_create sys_epoll_create -250 common epoll_ctl sys_epoll_ctl sys_epoll_ctl -251 common epoll_wait sys_epoll_wait sys_epoll_wait -252 common set_tid_address sys_set_tid_address sys_set_tid_address -253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 -254 common timer_create sys_timer_create compat_sys_timer_create -255 common timer_settime sys_timer_settime sys_timer_settime32 -256 common timer_gettime sys_timer_gettime sys_timer_gettime32 -257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun -258 common timer_delete sys_timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime sys_clock_settime32 -260 common clock_gettime sys_clock_gettime sys_clock_gettime32 -261 common clock_getres sys_clock_getres sys_clock_getres_time32 -262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 -264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 -265 common statfs64 sys_statfs64 compat_sys_statfs64 -266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages -268 common mbind sys_mbind compat_sys_mbind -269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy -270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy -271 common mq_open sys_mq_open compat_sys_mq_open -272 common mq_unlink sys_mq_unlink sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 -274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 -275 common mq_notify sys_mq_notify compat_sys_mq_notify -276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr -277 common kexec_load sys_kexec_load compat_sys_kexec_load -278 common add_key sys_add_key sys_add_key -279 common request_key sys_request_key sys_request_key -280 common keyctl sys_keyctl compat_sys_keyctl -281 common waitid sys_waitid compat_sys_waitid -282 common ioprio_set sys_ioprio_set sys_ioprio_set -283 common ioprio_get sys_ioprio_get sys_ioprio_get -284 common inotify_init sys_inotify_init sys_inotify_init -285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch -286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch -287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages -288 common openat sys_openat compat_sys_openat -289 common mkdirat sys_mkdirat sys_mkdirat -290 common mknodat sys_mknodat sys_mknodat -291 common fchownat sys_fchownat sys_fchownat -292 common futimesat sys_futimesat sys_futimesat_time32 -293 32 fstatat64 - compat_sys_s390_fstatat64 -293 64 newfstatat sys_newfstatat - -294 common unlinkat sys_unlinkat sys_unlinkat -295 common renameat sys_renameat sys_renameat -296 common linkat sys_linkat sys_linkat -297 common symlinkat sys_symlinkat sys_symlinkat -298 common readlinkat sys_readlinkat sys_readlinkat -299 common fchmodat sys_fchmodat sys_fchmodat -300 common faccessat sys_faccessat sys_faccessat -301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 -302 common ppoll sys_ppoll compat_sys_ppoll_time32 -303 common unshare sys_unshare sys_unshare -304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list -305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -306 common splice sys_splice sys_splice -307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range -308 common tee sys_tee sys_tee -309 common vmsplice sys_vmsplice sys_vmsplice -310 common move_pages sys_move_pages compat_sys_move_pages -311 common getcpu sys_getcpu sys_getcpu -312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -313 common utimes sys_utimes sys_utimes_time32 -314 common fallocate sys_fallocate compat_sys_s390_fallocate -315 common utimensat sys_utimensat sys_utimensat_time32 -316 common signalfd sys_signalfd compat_sys_signalfd -317 common timerfd - - -318 common eventfd sys_eventfd sys_eventfd -319 common timerfd_create sys_timerfd_create sys_timerfd_create -320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 -321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 -322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 -323 common eventfd2 sys_eventfd2 sys_eventfd2 -324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 -325 common pipe2 sys_pipe2 sys_pipe2 -326 common dup3 sys_dup3 sys_dup3 -327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 -328 common preadv sys_preadv compat_sys_preadv -329 common pwritev sys_pwritev compat_sys_pwritev -330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -331 common perf_event_open sys_perf_event_open sys_perf_event_open -332 common fanotify_init sys_fanotify_init sys_fanotify_init -333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -334 common prlimit64 sys_prlimit64 sys_prlimit64 -335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at -336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 -338 common syncfs sys_syncfs sys_syncfs -339 common setns sys_setns sys_setns -340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv -341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev -342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr -343 common kcmp sys_kcmp sys_kcmp -344 common finit_module sys_finit_module sys_finit_module -345 common sched_setattr sys_sched_setattr sys_sched_setattr -346 common sched_getattr sys_sched_getattr sys_sched_getattr -347 common renameat2 sys_renameat2 sys_renameat2 -348 common seccomp sys_seccomp sys_seccomp -349 common getrandom sys_getrandom sys_getrandom -350 common memfd_create sys_memfd_create sys_memfd_create -351 common bpf sys_bpf sys_bpf -352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write -353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read -354 common execveat sys_execveat compat_sys_execveat -355 common userfaultfd sys_userfaultfd sys_userfaultfd -356 common membarrier sys_membarrier sys_membarrier -357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 -358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg -359 common socket sys_socket sys_socket -360 common socketpair sys_socketpair sys_socketpair -361 common bind sys_bind sys_bind -362 common connect sys_connect sys_connect -363 common listen sys_listen sys_listen -364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt sys_getsockopt -366 common setsockopt sys_setsockopt sys_setsockopt -367 common getsockname sys_getsockname sys_getsockname -368 common getpeername sys_getpeername sys_getpeername -369 common sendto sys_sendto sys_sendto -370 common sendmsg sys_sendmsg compat_sys_sendmsg -371 common recvfrom sys_recvfrom compat_sys_recvfrom -372 common recvmsg sys_recvmsg compat_sys_recvmsg -373 common shutdown sys_shutdown sys_shutdown -374 common mlock2 sys_mlock2 sys_mlock2 -375 common copy_file_range sys_copy_file_range sys_copy_file_range -376 common preadv2 sys_preadv2 compat_sys_preadv2 -377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 -378 common s390_guarded_storage sys_s390_guarded_storage sys_s390_guarded_storage -379 common statx sys_statx sys_statx -380 common s390_sthyi sys_s390_sthyi sys_s390_sthyi -381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load -382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents -383 common rseq sys_rseq sys_rseq -384 common pkey_mprotect sys_pkey_mprotect sys_pkey_mprotect -385 common pkey_alloc sys_pkey_alloc sys_pkey_alloc -386 common pkey_free sys_pkey_free sys_pkey_free +1 common exit sys_exit +2 common fork sys_fork +3 common read sys_read +4 common write sys_write +5 common open sys_open +6 common close sys_close +7 common restart_syscall sys_restart_syscall +8 common creat sys_creat +9 common link sys_link +10 common unlink sys_unlink +11 common execve sys_execve +12 common chdir sys_chdir +14 common mknod sys_mknod +15 common chmod sys_chmod +19 common lseek sys_lseek +20 common getpid sys_getpid +21 common mount sys_mount +22 common umount sys_oldumount +26 common ptrace sys_ptrace +27 common alarm sys_alarm +29 common pause sys_pause +30 common utime sys_utime +33 common access sys_access +34 common nice sys_nice +36 common sync sys_sync +37 common kill sys_kill +38 common rename sys_rename +39 common mkdir sys_mkdir +40 common rmdir sys_rmdir +41 common dup sys_dup +42 common pipe sys_pipe +43 common times sys_times +45 common brk sys_brk +48 common signal sys_signal +51 common acct sys_acct +52 common umount2 sys_umount +54 common ioctl sys_ioctl +55 common fcntl sys_fcntl +57 common setpgid sys_setpgid +60 common umask sys_umask +61 common chroot sys_chroot +62 common ustat sys_ustat +63 common dup2 sys_dup2 +64 common getppid sys_getppid +65 common getpgrp sys_getpgrp +66 common setsid sys_setsid +67 common sigaction sys_sigaction +72 common sigsuspend sys_sigsuspend +73 common sigpending sys_sigpending +74 common sethostname sys_sethostname +75 common setrlimit sys_setrlimit +77 common getrusage sys_getrusage +78 common gettimeofday sys_gettimeofday +79 common settimeofday sys_settimeofday +83 common symlink sys_symlink +85 common readlink sys_readlink +86 common uselib sys_uselib +87 common swapon sys_swapon +88 common reboot sys_reboot +89 common readdir sys_ni_syscall +90 common mmap sys_old_mmap +91 common munmap sys_munmap +92 common truncate sys_truncate +93 common ftruncate sys_ftruncate +94 common fchmod sys_fchmod +96 common getpriority sys_getpriority +97 common setpriority sys_setpriority +99 common statfs sys_statfs +100 common fstatfs sys_fstatfs +102 common socketcall sys_socketcall +103 common syslog sys_syslog +104 common setitimer sys_setitimer +105 common getitimer sys_getitimer +106 common stat sys_newstat +107 common lstat sys_newlstat +108 common fstat sys_newfstat +110 common lookup_dcookie sys_ni_syscall +111 common vhangup sys_vhangup +112 common idle sys_ni_syscall +114 common wait4 sys_wait4 +115 common swapoff sys_swapoff +116 common sysinfo sys_sysinfo +117 common ipc sys_s390_ipc +118 common fsync sys_fsync +119 common sigreturn sys_sigreturn +120 common clone sys_clone +121 common setdomainname sys_setdomainname +122 common uname sys_newuname +124 common adjtimex sys_adjtimex +125 common mprotect sys_mprotect +126 common sigprocmask sys_sigprocmask +127 common create_module sys_ni_syscall +128 common init_module sys_init_module +129 common delete_module sys_delete_module +130 common get_kernel_syms sys_ni_syscall +131 common quotactl sys_quotactl +132 common getpgid sys_getpgid +133 common fchdir sys_fchdir +134 common bdflush sys_ni_syscall +135 common sysfs sys_sysfs +136 common personality sys_s390_personality +137 common afs_syscall sys_ni_syscall +141 common getdents sys_getdents +142 common select sys_select +143 common flock sys_flock +144 common msync sys_msync +145 common readv sys_readv +146 common writev sys_writev +147 common getsid sys_getsid +148 common fdatasync sys_fdatasync +149 common _sysctl sys_ni_syscall +150 common mlock sys_mlock +151 common munlock sys_munlock +152 common mlockall sys_mlockall +153 common munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep +163 common mremap sys_mremap +167 common query_module sys_ni_syscall +168 common poll sys_poll +169 common nfsservctl sys_ni_syscall +172 common prctl sys_prctl +173 common rt_sigreturn sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait +178 common rt_sigqueueinfo sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend +180 common pread64 sys_pread64 +181 common pwrite64 sys_pwrite64 +183 common getcwd sys_getcwd +184 common capget sys_capget +185 common capset sys_capset +186 common sigaltstack sys_sigaltstack +187 common sendfile sys_sendfile64 +188 common getpmsg sys_ni_syscall +189 common putpmsg sys_ni_syscall +190 common vfork sys_vfork +191 common getrlimit sys_getrlimit +198 common lchown sys_lchown +199 common getuid sys_getuid +200 common getgid sys_getgid +201 common geteuid sys_geteuid +202 common getegid sys_getegid +203 common setreuid sys_setreuid +204 common setregid sys_setregid +205 common getgroups sys_getgroups +206 common setgroups sys_setgroups +207 common fchown sys_fchown +208 common setresuid sys_setresuid +209 common getresuid sys_getresuid +210 common setresgid sys_setresgid +211 common getresgid sys_getresgid +212 common chown sys_chown +213 common setuid sys_setuid +214 common setgid sys_setgid +215 common setfsuid sys_setfsuid +216 common setfsgid sys_setfsgid +217 common pivot_root sys_pivot_root +218 common mincore sys_mincore +219 common madvise sys_madvise +220 common getdents64 sys_getdents64 +222 common readahead sys_readahead +224 common setxattr sys_setxattr +225 common lsetxattr sys_lsetxattr +226 common fsetxattr sys_fsetxattr +227 common getxattr sys_getxattr +228 common lgetxattr sys_lgetxattr +229 common fgetxattr sys_fgetxattr +230 common listxattr sys_listxattr +231 common llistxattr sys_llistxattr +232 common flistxattr sys_flistxattr +233 common removexattr sys_removexattr +234 common lremovexattr sys_lremovexattr +235 common fremovexattr sys_fremovexattr +236 common gettid sys_gettid +237 common tkill sys_tkill +238 common futex sys_futex +239 common sched_setaffinity sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity +241 common tgkill sys_tgkill +243 common io_setup sys_io_setup +244 common io_destroy sys_io_destroy +245 common io_getevents sys_io_getevents +246 common io_submit sys_io_submit +247 common io_cancel sys_io_cancel +248 common exit_group sys_exit_group +249 common epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl +251 common epoll_wait sys_epoll_wait +252 common set_tid_address sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 +254 common timer_create sys_timer_create +255 common timer_settime sys_timer_settime +256 common timer_gettime sys_timer_gettime +257 common timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime +260 common clock_gettime sys_clock_gettime +261 common clock_getres sys_clock_getres +262 common clock_nanosleep sys_clock_nanosleep +265 common statfs64 sys_statfs64 +266 common fstatfs64 sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages +268 common mbind sys_mbind +269 common get_mempolicy sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy +271 common mq_open sys_mq_open +272 common mq_unlink sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend +274 common mq_timedreceive sys_mq_timedreceive +275 common mq_notify sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr +277 common kexec_load sys_kexec_load +278 common add_key sys_add_key +279 common request_key sys_request_key +280 common keyctl sys_keyctl +281 common waitid sys_waitid +282 common ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages +288 common openat sys_openat +289 common mkdirat sys_mkdirat +290 common mknodat sys_mknodat +291 common fchownat sys_fchownat +292 common futimesat sys_futimesat +293 common newfstatat sys_newfstatat +294 common unlinkat sys_unlinkat +295 common renameat sys_renameat +296 common linkat sys_linkat +297 common symlinkat sys_symlinkat +298 common readlinkat sys_readlinkat +299 common fchmodat sys_fchmodat +300 common faccessat sys_faccessat +301 common pselect6 sys_pselect6 +302 common ppoll sys_ppoll +303 common unshare sys_unshare +304 common set_robust_list sys_set_robust_list +305 common get_robust_list sys_get_robust_list +306 common splice sys_splice +307 common sync_file_range sys_sync_file_range +308 common tee sys_tee +309 common vmsplice sys_vmsplice +310 common move_pages sys_move_pages +311 common getcpu sys_getcpu +312 common epoll_pwait sys_epoll_pwait +313 common utimes sys_utimes +314 common fallocate sys_fallocate +315 common utimensat sys_utimensat +316 common signalfd sys_signalfd +317 common timerfd sys_ni_syscall +318 common eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime +321 common timerfd_gettime sys_timerfd_gettime +322 common signalfd4 sys_signalfd4 +323 common eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 +326 common dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv +329 common pwritev sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open +332 common fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark +334 common prlimit64 sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime +338 common syncfs sys_syncfs +339 common setns sys_setns +340 common process_vm_readv sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp +344 common finit_module sys_finit_module +345 common sched_setattr sys_sched_setattr +346 common sched_getattr sys_sched_getattr +347 common renameat2 sys_renameat2 +348 common seccomp sys_seccomp +349 common getrandom sys_getrandom +350 common memfd_create sys_memfd_create +351 common bpf sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read +354 common execveat sys_execveat +355 common userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg +358 common sendmmsg sys_sendmmsg +359 common socket sys_socket +360 common socketpair sys_socketpair +361 common bind sys_bind +362 common connect sys_connect +363 common listen sys_listen +364 common accept4 sys_accept4 +365 common getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt +367 common getsockname sys_getsockname +368 common getpeername sys_getpeername +369 common sendto sys_sendto +370 common sendmsg sys_sendmsg +371 common recvfrom sys_recvfrom +372 common recvmsg sys_recvmsg +373 common shutdown sys_shutdown +374 common mlock2 sys_mlock2 +375 common copy_file_range sys_copy_file_range +376 common preadv2 sys_preadv2 +377 common pwritev2 sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage +379 common statx sys_statx +380 common s390_sthyi sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents +383 common rseq sys_rseq +384 common pkey_mprotect sys_pkey_mprotect +385 common pkey_alloc sys_pkey_alloc +386 common pkey_free sys_pkey_free # room for arch specific syscalls -392 64 semtimedop sys_semtimedop - -393 common semget sys_semget sys_semget -394 common semctl sys_semctl compat_sys_semctl -395 common shmget sys_shmget sys_shmget -396 common shmctl sys_shmctl compat_sys_shmctl -397 common shmat sys_shmat compat_sys_shmat -398 common shmdt sys_shmdt sys_shmdt -399 common msgget sys_msgget sys_msgget -400 common msgsnd sys_msgsnd compat_sys_msgsnd -401 common msgrcv sys_msgrcv compat_sys_msgrcv -402 common msgctl sys_msgctl compat_sys_msgctl -403 32 clock_gettime64 - sys_clock_gettime -404 32 clock_settime64 - sys_clock_settime -405 32 clock_adjtime64 - sys_clock_adjtime -406 32 clock_getres_time64 - sys_clock_getres -407 32 clock_nanosleep_time64 - sys_clock_nanosleep -408 32 timer_gettime64 - sys_timer_gettime -409 32 timer_settime64 - sys_timer_settime -410 32 timerfd_gettime64 - sys_timerfd_gettime -411 32 timerfd_settime64 - sys_timerfd_settime -412 32 utimensat_time64 - sys_utimensat -413 32 pselect6_time64 - compat_sys_pselect6_time64 -414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents -417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 -418 32 mq_timedsend_time64 - sys_mq_timedsend -419 32 mq_timedreceive_time64 - sys_mq_timedreceive -420 32 semtimedop_time64 - sys_semtimedop -421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 -422 32 futex_time64 - sys_futex -423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval -424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal -425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup -426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter -427 common io_uring_register sys_io_uring_register sys_io_uring_register -428 common open_tree sys_open_tree sys_open_tree -429 common move_mount sys_move_mount sys_move_mount -430 common fsopen sys_fsopen sys_fsopen -431 common fsconfig sys_fsconfig sys_fsconfig -432 common fsmount sys_fsmount sys_fsmount -433 common fspick sys_fspick sys_fspick -434 common pidfd_open sys_pidfd_open sys_pidfd_open -435 common clone3 sys_clone3 sys_clone3 -436 common close_range sys_close_range sys_close_range -437 common openat2 sys_openat2 sys_openat2 -438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd -439 common faccessat2 sys_faccessat2 sys_faccessat2 -440 common process_madvise sys_process_madvise sys_process_madvise -441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 -442 common mount_setattr sys_mount_setattr sys_mount_setattr -443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd -444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset -445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule -446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +392 common semtimedop sys_semtimedop +393 common semget sys_semget +394 common semctl sys_semctl +395 common shmget sys_shmget +396 common shmctl sys_shmctl +397 common shmat sys_shmat +398 common shmdt sys_shmdt +399 common msgget sys_msgget +400 common msgsnd sys_msgsnd +401 common msgrcv sys_msgrcv +402 common msgctl sys_msgctl +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree +429 common move_mount sys_move_mount +430 common fsopen sys_fsopen +431 common fsconfig sys_fsconfig +432 common fsmount sys_fsmount +433 common fspick sys_fspick +434 common pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 +436 common close_range sys_close_range +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr +443 common quotactl_fd sys_quotactl_fd +444 common landlock_create_ruleset sys_landlock_create_ruleset +445 common landlock_add_rule sys_landlock_add_rule +446 common landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat +452 common fchmodat2 sys_fchmodat2 +453 common map_shadow_stack sys_map_shadow_stack +454 common futex_wake sys_futex_wake +455 common futex_wait sys_futex_wait +456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount +459 common lsm_get_self_attr sys_lsm_get_self_attr +460 common lsm_set_self_attr sys_lsm_set_self_attr +461 common lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat +467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/s390/kernel/syscalls/syscalltbl b/arch/s390/kernel/syscalls/syscalltbl deleted file mode 100755 index fbac1732f874..000000000000 --- a/arch/s390/kernel/syscalls/syscalltbl +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Generate system call table and header files -# -# Copyright IBM Corp. 2018 -# Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> - -# -# File path to the system call table definition. -# You can set the path with the -i option. If omitted, -# system call table definitions are read from standard input. -# -SYSCALL_TBL="" - - -create_syscall_table_entries() -{ - local nr abi name entry64 entry32 _ignore - local temp=$(mktemp ${TMPDIR:-/tmp}/syscalltbl-common.XXXXXXXXX) - - ( - # - # Initialize with 0 to create an NI_SYSCALL for 0 - # - local prev_nr=0 prev_32=sys_ni_syscall prev_64=sys_ni_syscall - while read nr abi name entry64 entry32 _ignore; do - test x$entry32 = x- && entry32=sys_ni_syscall - test x$entry64 = x- && entry64=sys_ni_syscall - - if test $prev_nr -eq $nr; then - # - # Same syscall but different ABI, just update - # the respective entry point - # - case $abi in - 32) - prev_32=$entry32 - ;; - 64) - prev_64=$entry64 - ;; - esac - continue; - else - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - fi - - prev_nr=$nr - prev_64=$entry64 - prev_32=$entry32 - done - printf "%d\t%s\t%s\n" $prev_nr $prev_64 $prev_32 - ) >> $temp - - # - # Check for duplicate syscall numbers - # - if ! cat $temp |cut -f1 |uniq -d 2>&1; then - echo "Error: generated system call table contains duplicate entries: $temp" >&2 - exit 1 - fi - - # - # Generate syscall table - # - prev_nr=0 - while read nr entry64 entry32; do - while test $prev_nr -lt $((nr - 1)); do - printf "NI_SYSCALL\n" - prev_nr=$((prev_nr + 1)) - done - if test x$entry64 = xsys_ni_syscall && - test x$entry32 = xsys_ni_syscall; then - printf "NI_SYSCALL\n" - else - printf "SYSCALL(%s,%s)\n" $entry64 $entry32 - fi - prev_nr=$nr - done < $temp - rm $temp -} - -generate_syscall_table() -{ - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 */ - /* - * Definitions for sys_call_table, each line represents an - * entry in the table in the form - * SYSCALL(64 bit syscall, 31 bit emulated syscall) - * - * This file is meant to be included from entry.S. - */ - - #define NI_SYSCALL SYSCALL(sys_ni_syscall,sys_ni_syscall) - -EoHEADER - grep -Ev '^(#|[[:blank:]]*$)' $SYSCALL_TBL \ - |sort -k1 -n \ - |create_syscall_table_entries -} - -create_header_defines() -{ - local nr abi name _ignore - - while read nr abi name _ignore; do - printf "#define __NR_%s %d\n" $name $nr - done -} - -normalize_fileguard() -{ - local fileguard="$1" - - echo "$1" |tr '[[:lower:]]' '[[:upper:]]' \ - |sed -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g' -} - -generate_syscall_header() -{ - local abis=$(echo "($1)" | tr ',' '|') - local filename="$2" - local fileguard suffix - - if test "$filename"; then - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_$2") - else - case "$abis" in - *64*) suffix=64 ;; - *32*) suffix=32 ;; - esac - fileguard=$(normalize_fileguard "__UAPI_ASM_S390_SYSCALLS_$suffix") - fi - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef ${fileguard} - #define ${fileguard} - -EoHEADER - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sort -k1 -n \ - |create_header_defines - - cat <<-EoFOOTER - - #endif /* ${fileguard} */ -EoFOOTER -} - -__max_syscall_nr() -{ - local abis=$(echo "($1)" | tr ',' '|') - - grep -E "^[[:digit:]]+[[:space:]]+${abis}" $SYSCALL_TBL \ - |sed -ne 's/^\([[:digit:]]*\)[[:space:]].*/\1/p' \ - |sort -n \ - |tail -1 -} - - -generate_syscall_nr() -{ - local abis="$1" - local max_syscall_nr num_syscalls - - max_syscall_nr=$(__max_syscall_nr "$abis") - num_syscalls=$((max_syscall_nr + 1)) - - cat <<-EoHEADER - /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef __ASM_S390_SYSCALLS_NR - #define __ASM_S390_SYSCALLS_NR - - #define NR_syscalls ${num_syscalls} - - #endif /* __ASM_S390_SYSCALLS_NR */ -EoHEADER -} - - -# -# Parse command line arguments -# -do_syscall_header="" -do_syscall_table="" -do_syscall_nr="" -output_file="" -abi_list="common,64" -filename="" -while getopts ":HNSXi:a:f:" arg; do - case $arg in - a) - abi_list="$OPTARG" - ;; - i) - SYSCALL_TBL="$OPTARG" - ;; - f) - filename=${OPTARG##*/} - ;; - H) - do_syscall_header=1 - ;; - N) - do_syscall_nr=1 - ;; - S) - do_syscall_table=1 - ;; - X) - set -x - ;; - :) - echo "Missing argument for -$OPTARG" >&2 - exit 1 - ;; - \?) - echo "Invalid option specified" >&2 - exit 1 - ;; - esac -done - -test "$do_syscall_header" && generate_syscall_header "$abi_list" "$filename" -test "$do_syscall_table" && generate_syscall_table -test "$do_syscall_nr" && generate_syscall_nr "$abi_list" - -exit 0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index ef3f2659876c..33ca3e47a0e6 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -5,6 +5,7 @@ * Martin Schwidefsky <schwidefsky@de.ibm.com>, */ +#include <linux/cpufeature.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -14,54 +15,18 @@ #include <linux/delay.h> #include <linux/export.h> #include <linux/slab.h> +#include <asm/asm-extable.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/debug.h> #include <asm/sysinfo.h> #include <asm/cpcmd.h> #include <asm/topology.h> -#include <asm/fpu/api.h> +#include <asm/fpu.h> +#include <asm/asm.h> int topology_max_mnest; -static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl) -{ - int r0 = (fc << 28) | sel1; - int rc = 0; - - asm volatile( - " lr 0,%[r0]\n" - " lr 1,%[r1]\n" - " stsi 0(%[sysinfo])\n" - "0: jz 2f\n" - "1: lhi %[rc],%[retval]\n" - "2: lr %[r0],0\n" - EX_TABLE(0b, 1b) - : [r0] "+d" (r0), [rc] "+d" (rc) - : [r1] "d" (sel2), - [sysinfo] "a" (sysinfo), - [retval] "K" (-EOPNOTSUPP) - : "cc", "0", "1", "memory"); - *lvl = ((unsigned int) r0) >> 28; - return rc; -} - -/* - * stsi - store system information - * - * Returns the current configuration level if function code 0 was specified. - * Otherwise returns 0 on success or a negative value on error. - */ -int stsi(void *sysinfo, int fc, int sel1, int sel2) -{ - int lvl, rc; - - rc = __stsi(sysinfo, fc, sel1, sel2, &lvl); - if (rc) - return rc; - return fc ? 0 : lvl; -} -EXPORT_SYMBOL(stsi); - #ifdef CONFIG_PROC_FS static bool convert_ext_name(unsigned char encoding, char *name, size_t len) @@ -80,10 +45,12 @@ static bool convert_ext_name(unsigned char encoding, char *name, size_t len) static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { + bool has_var_cap; int i; if (stsi(info, 1, 1, 1)) return; + has_var_cap = !!info->model_var_cap[0]; EBCASC(info->manufacturer, sizeof(info->manufacturer)); EBCASC(info->type, sizeof(info->type)); EBCASC(info->model, sizeof(info->model)); @@ -92,6 +59,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) EBCASC(info->model_capacity, sizeof(info->model_capacity)); EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); + if (has_var_cap) + EBCASC(info->model_var_cap, sizeof(info->model_var_cap)); seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); seq_printf(m, "Type: %-4.4s\n", info->type); if (info->lic) @@ -119,12 +88,18 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", info->model_temp_cap, info->model_temp_cap_rating); + if (has_var_cap && info->model_var_cap_rating) + seq_printf(m, "Model Var. Capacity: %-16.16s %08u\n", + info->model_var_cap, + info->model_var_cap_rating); if (info->ncr) seq_printf(m, "Nominal Cap. Rating: %08u\n", info->ncr); if (info->npr) seq_printf(m, "Nominal Perm. Rating: %08u\n", info->npr); if (info->ntr) seq_printf(m, "Nominal Temp. Rating: %08u\n", info->ntr); + if (has_var_cap && info->nvr) + seq_printf(m, "Nominal Var. Rating: %08u\n", info->nvr); if (info->cai) { seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); @@ -143,7 +118,7 @@ static void stsi_15_1_x(struct seq_file *m, struct sysinfo_15_1_x *info) int i; seq_putc(m, '\n'); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; if (stsi(info, 15, 1, topology_max_mnest)) return; @@ -386,7 +361,7 @@ static void service_level_vm_print(struct seq_file *m, { char *query_buffer, *str; - query_buffer = kmalloc(1024, GFP_KERNEL | GFP_DMA); + query_buffer = kmalloc(1024, GFP_KERNEL); if (!query_buffer) return; cpcmd("QUERY CPLEVEL", query_buffer, 1024, NULL); @@ -404,7 +379,7 @@ static struct service_level service_level_vm = { static __init int create_proc_service_level(void) { proc_create_seq("service_levels", 0, NULL, &service_level_seq_ops); - if (MACHINE_IS_VM) + if (machine_is_vm()) register_service_level(&service_level_vm); return 0; } @@ -415,9 +390,9 @@ subsys_initcall(create_proc_service_level); */ void s390_adjust_jiffies(void) { + DECLARE_KERNEL_FPU_ONSTACK16(fpu); struct sysinfo_1_2_2 *info; unsigned long capability; - struct kernel_fpu fpu; info = (void *) get_zeroed_page(GFP_KERNEL); if (!info) @@ -436,21 +411,14 @@ void s390_adjust_jiffies(void) * point division .. */ kernel_fpu_begin(&fpu, KERNEL_FPR); - asm volatile( - " sfpc %3\n" - " l %0,%1\n" - " tmlh %0,0xff80\n" - " jnz 0f\n" - " cefbr %%f2,%0\n" - " j 1f\n" - "0: le %%f2,%1\n" - "1: cefbr %%f0,%2\n" - " debr %%f0,%%f2\n" - " cgebr %0,5,%%f0\n" - : "=&d" (capability) - : "Q" (info->capability), "d" (10000000), "d" (0) - : "cc" - ); + fpu_sfpc(0); + if (info->capability & 0xff800000) + fpu_ldgr(2, info->capability); + else + fpu_cefbr(2, info->capability); + fpu_cefbr(0, 10000000); + fpu_debr(0, 2); + capability = fpu_cgebr(0, 5); kernel_fpu_end(&fpu, KERNEL_FPR); } else /* @@ -494,7 +462,6 @@ static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ .open = stsi_open_##fc##_##s1##_##s2, \ .release = stsi_release, \ .read = stsi_read, \ - .llseek = no_llseek, \ }; static int stsi_release(struct inode *inode, struct file *file) @@ -556,10 +523,10 @@ static __init int stsi_init_debugfs(void) sf = &stsi_file[i]; debugfs_create_file(sf->name, 0400, stsi_root, NULL, sf->fops); } - if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && MACHINE_HAS_TOPOLOGY) { + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY) && cpu_has_topology()) { char link_to[10]; - sprintf(link_to, "15_1_%d", topology_mnest_limit()); + snprintf(link_to, sizeof(link_to), "15_1_%d", topology_mnest_limit()); debugfs_create_symlink("topology", stsi_root, link_to); } return 0; diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S new file mode 100644 index 000000000000..26f2981aa09e --- /dev/null +++ b/arch/s390/kernel/text_amode31.S @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include <linux/linkage.h> +#include <asm/asm-extable.h> +#include <asm/errno.h> +#include <asm/sigp.h> + + .section .amode31.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_AMODE31_r14 + exrl 0,0f + j . +0: br %r14 + .endm + +/* + * int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +SYM_FUNC_START(_diag14_amode31) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag14_ex, .Ldiag14_fault) +SYM_FUNC_END(_diag14_amode31) + +/* + * int _diag210_amode31(struct diag210 *addr) + */ +SYM_FUNC_START(_diag210_amode31) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag210_ex, .Ldiag210_fault) +SYM_FUNC_END(_diag210_amode31) + +/* + * int diag8c(struct diag8c *addr, struct ccw_dev_id *devno, size_t len) +*/ +SYM_FUNC_START(_diag8c_amode31) + llgf %r3,0(%r3) + sam31 + diag %r2,%r4,0x8c +.Ldiag8c_ex: + sam64 + lgfr %r2,%r3 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag8c_ex, .Ldiag8c_ex) +SYM_FUNC_END(_diag8c_amode31) +/* + * int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode) + */ +SYM_FUNC_START(_diag26c_amode31) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_AMODE31_r14 + EX_TABLE_AMODE31(.Ldiag26c_ex, .Ldiag26c_ex) +SYM_FUNC_END(_diag26c_amode31) + +/* + * void _diag0c_amode31(unsigned long rx) + */ +SYM_FUNC_START(_diag0c_amode31) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag0c_amode31) + +/* + * void _diag308_reset_amode31(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +SYM_FUNC_START(_diag308_reset_amode31) + larl %r4,ctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,ctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,fpctl # Floating point control register + stfpc 0(%r4) + larl %r4,prefix # Save prefix register + stpx 0(%r4) + larl %r4,prefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,continue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_part2 # Setup restart PSW at absolute 0 + larl %r3,restart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,ctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,fpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,prefix # Restore prefix register + spx 0(%r4) + larl %r4,continue_psw # Restore PSW flags + larl %r2,.Lcontinue + stg %r2,8(%r4) + lpswe 0(%r4) +.Lcontinue: + BR_EX_AMODE31_r14 +SYM_FUNC_END(_diag308_reset_amode31) + + .section .amode31.data,"aw",@progbits + .balign 8 +SYM_DATA_LOCAL(restart_diag308_psw, .long 0x00080000,0x80000000) +SYM_DATA_LOCAL(continue_psw, .quad 0,0) +SYM_DATA_LOCAL(ctlreg0, .quad 0) +SYM_DATA_LOCAL(ctlregs, .fill 16,8,0) +SYM_DATA_LOCAL(fpctl, .long 0) +SYM_DATA_LOCAL(prefix, .long 0) +SYM_DATA_LOCAL(prefix_zero, .long 0) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 326cb8f75f58..bd0df61d1907 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -12,8 +12,7 @@ * Copyright (C) 1991, 1992, 1995 Linus Torvalds */ -#define KMSG_COMPONENT "time" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "time: " fmt #include <linux/kernel_stat.h> #include <linux/errno.h> @@ -36,7 +35,6 @@ #include <linux/profile.h> #include <linux/timex.h> #include <linux/notifier.h> -#include <linux/timekeeper_internal.h> #include <linux/clockchips.h> #include <linux/gfp.h> #include <linux/kprobes.h> @@ -55,10 +53,10 @@ #include <asm/cio.h> #include "entry.h" -union tod_clock tod_clock_base __section(".data"); +union tod_clock __bootdata_preserved(tod_clock_base); EXPORT_SYMBOL_GPL(tod_clock_base); -u64 clock_comparator_max = -1ULL; +u64 __bootdata_preserved(clock_comparator_max); EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -70,8 +68,6 @@ unsigned char ptff_function_mask[16]; static unsigned long lpar_offset; static unsigned long initial_leap_seconds; -static unsigned long tod_steering_end; -static long tod_steering_delta; /* * Get time offsets with PTFF @@ -80,12 +76,8 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; - int cs; - /* Initialize TOD steering parameters */ - tod_steering_end = tod_clock_base.tod; - for (cs = 0; cs < CS_BASES; cs++) - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod; if (!test_facility(28)) return; @@ -102,6 +94,11 @@ void __init time_early_init(void) ((long) qui.old_leap * 4096000000L); } +unsigned long long noinstr sched_clock_noinstr(void) +{ + return tod_to_ns(__get_tod_clock_monotonic()); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -126,7 +123,7 @@ void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = clock_comparator_max; + get_lowcore()->clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -134,8 +131,8 @@ void clock_comparator_work(void) static int s390_next_event(unsigned long delta, struct clock_event_device *evt) { - S390_lowcore.clock_comparator = get_tod_clock() + delta; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = get_tod_clock() + delta; + set_clock_comparator(get_lowcore()->clock_comparator); return 0; } @@ -148,8 +145,8 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = clock_comparator_max; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = clock_comparator_max; + set_clock_comparator(get_lowcore()->clock_comparator); cpu = smp_processor_id(); cd = &per_cpu(comparators, cpu); @@ -168,10 +165,10 @@ void init_cpu_timer(void) clockevents_register_device(cd); /* Enable clock comparator timer interrupt. */ - __ctl_set_bit(0,11); + local_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SUBMASK_BIT); /* Always allow the timing alert external interrupt. */ - __ctl_set_bit(0, 4); + local_ctl_set_bit(0, CR0_ETR_SUBMASK_BIT); } static void clock_comparator_interrupt(struct ext_code ext_code, @@ -179,8 +176,8 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == clock_comparator_max) - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator == clock_comparator_max) + set_clock_comparator(get_lowcore()->clock_comparator); } static void stp_timing_alert(struct stp_irq_parm *); @@ -224,21 +221,7 @@ void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, static u64 read_tod_clock(struct clocksource *cs) { - unsigned long now, adj; - - preempt_disable(); /* protect from changes to steering parameters */ - now = get_tod_clock(); - adj = tod_steering_end - now; - if (unlikely((s64) adj > 0)) - /* - * manually steer by 1 cycle every 2^16 cycles. This - * corresponds to shifting the tod delta by 15. 1s is - * therefore steered in ~9h. The adjust will decrease - * over time, until it finally reaches 0. - */ - now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); - preempt_enable(); - return now; + return get_tod_clock_monotonic(); } static struct clocksource clocksource_tod = { @@ -246,10 +229,11 @@ static struct clocksource clocksource_tod = { .rating = 400, .read = read_tod_clock, .mask = CLOCKSOURCE_MASK(64), - .mult = 1000, - .shift = 12, + .mult = 4096000, + .shift = 24, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .vdso_clock_mode = VDSO_CLOCKMODE_TOD, + .id = CSID_S390_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -364,31 +348,13 @@ static inline int check_sync_clock(void) * Apply clock delta to the global data structures. * This is called once on the CPU that performed the clock sync. */ -static void clock_sync_global(unsigned long delta) +static void clock_sync_global(long delta) { - unsigned long now, adj; struct ptff_qto qto; - int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; - /* Adjust TOD steering parameters. */ - now = get_tod_clock(); - adj = tod_steering_end - now; - if (unlikely((s64) adj >= 0)) - /* Calculate how much of the old adjustment is left. */ - tod_steering_delta = (tod_steering_delta < 0) ? - -(adj >> 15) : (adj >> 15); - tod_steering_delta += delta; - if ((abs(tod_steering_delta) >> 48) != 0) - panic("TOD clock sync offset %li is too large to drift\n", - tod_steering_delta); - tod_steering_end = now + (abs(tod_steering_delta) << 15); - for (cs = 0; cs < CS_BASES; cs++) { - vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; - vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; - } - + vdso_k_time_data->arch_data.tod_delta = tod_clock_base.tod; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) lpar_offset = qto.tod_epoch_difference; @@ -400,15 +366,15 @@ static void clock_sync_global(unsigned long delta) * Apply clock delta to the per-CPU data structures of this CPU. * This is called for each online CPU after the call to clock_sync_global. */ -static void clock_sync_local(unsigned long delta) +static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != clock_comparator_max) { - S390_lowcore.clock_comparator += delta; - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator != clock_comparator_max) { + get_lowcore()->clock_comparator += delta; + set_clock_comparator(get_lowcore()->clock_comparator); } /* Adjust the last_update_clock time-stamp. */ - S390_lowcore.last_update_clock += delta; + get_lowcore()->last_update_clock += delta; } /* Single threaded workqueue used for stp sync events */ @@ -424,13 +390,13 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long clock_delta; + long clock_delta; }; /* * Server Time Protocol (STP) code. */ -static bool stp_online; +static bool stp_online = true; static struct stp_sstpi stp_info; static void *stp_page; @@ -456,13 +422,18 @@ static void __init stp_reset(void) if (rc == 0) set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags); else if (stp_online) { - pr_warn("The real or virtual hardware system does not provide an STP interface\n"); free_page((unsigned long) stp_page); stp_page = NULL; stp_online = false; } } +bool stp_enabled(void) +{ + return test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online; +} +EXPORT_SYMBOL(stp_enabled); + static void stp_timeout(struct timer_list *unused) { queue_work(time_sync_wq, &stp_work); @@ -544,7 +515,7 @@ static int stpinfo_valid(void) static int stp_sync_clock(void *data) { struct clock_sync_data *sync = data; - u64 clock_delta, flags; + long clock_delta, flags; static int first; int rc; @@ -554,9 +525,7 @@ static int stp_sync_clock(void *data) while (atomic_read(&sync->cpus) != 0) cpu_relax(); rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { + if (stp_info.todoff || stp_info.tmd != 2) { flags = vdso_update_begin(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); @@ -576,7 +545,7 @@ static int stp_sync_clock(void *data) atomic_dec(&sync->cpus); /* Wait for in_sync to be set. */ while (READ_ONCE(sync->in_sync) == 0) - __udelay(1); + ; } if (sync->in_sync != 1) /* Didn't work. Clear per-cpu in sync bit again. */ @@ -587,81 +556,6 @@ static int stp_sync_clock(void *data) return 0; } -static int stp_clear_leap(void) -{ - struct __kernel_timex txc; - int ret; - - memset(&txc, 0, sizeof(txc)); - - ret = do_adjtimex(&txc); - if (ret < 0) - return ret; - - txc.modes = ADJ_STATUS; - txc.status &= ~(STA_INS|STA_DEL); - return do_adjtimex(&txc); -} - -static void stp_check_leap(void) -{ - struct stp_stzi stzi; - struct stp_lsoib *lsoib = &stzi.lsoib; - struct __kernel_timex txc; - int64_t timediff; - int leapdiff, ret; - - if (!stp_info.lu || !check_sync_clock()) { - /* - * Either a scheduled leap second was removed by the operator, - * or STP is out of sync. In both cases, clear the leap second - * kernel flags. - */ - if (stp_clear_leap() < 0) - pr_err("failed to clear leap second flags\n"); - return; - } - - if (chsc_stzi(stp_page, &stzi, sizeof(stzi))) { - pr_err("stzi failed\n"); - return; - } - - timediff = tod_to_ns(lsoib->nlsout - get_tod_clock()) / NSEC_PER_SEC; - leapdiff = lsoib->nlso - lsoib->also; - - if (leapdiff != 1 && leapdiff != -1) { - pr_err("Cannot schedule %d leap seconds\n", leapdiff); - return; - } - - if (timediff < 0) { - if (stp_clear_leap() < 0) - pr_err("failed to clear leap second flags\n"); - } else if (timediff < 7200) { - memset(&txc, 0, sizeof(txc)); - ret = do_adjtimex(&txc); - if (ret < 0) - return; - - txc.modes = ADJ_STATUS; - if (leapdiff > 0) - txc.status |= STA_INS; - else - txc.status |= STA_DEL; - ret = do_adjtimex(&txc); - if (ret < 0) - pr_err("failed to set leap second flags\n"); - /* arm Timer to clear leap second flags */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); - } else { - /* The day the leap second is scheduled for hasn't been reached. Retry - * in one hour. - */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); - } -} - /* * STP work. Check for the STP state and take over the clock * synchronization if the STP clock source is usable. @@ -676,7 +570,7 @@ static void stp_work_fn(struct work_struct *work) if (!stp_online) { chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL); - del_timer_sync(&stp_timer); + timer_delete_sync(&stp_timer); goto out_unlock; } @@ -699,12 +593,10 @@ static void stp_work_fn(struct work_struct *work) if (!check_sync_clock()) /* - * There is a usable clock but the synchonization failed. + * There is a usable clock but the synchronization failed. * Retry after a second. */ mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC)); - else if (stp_info.lu) - stp_check_leap(); out_unlock: mutex_unlock(&stp_mutex); @@ -713,7 +605,7 @@ out_unlock: /* * STP subsys sysfs interface functions */ -static struct bus_type stp_subsys = { +static const struct bus_type stp_subsys = { .name = "stp", .dev_name = "stp", }; @@ -726,8 +618,8 @@ static ssize_t ctn_id_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%016lx\n", - *(unsigned long *) stp_info.ctnid); + ret = sysfs_emit(buf, "%016lx\n", + *(unsigned long *)stp_info.ctnid); mutex_unlock(&stp_mutex); return ret; } @@ -742,7 +634,7 @@ static ssize_t ctn_type_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.ctn); + ret = sysfs_emit(buf, "%i\n", stp_info.ctn); mutex_unlock(&stp_mutex); return ret; } @@ -757,7 +649,7 @@ static ssize_t dst_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x2000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.dsto); mutex_unlock(&stp_mutex); return ret; } @@ -772,7 +664,7 @@ static ssize_t leap_seconds_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x8000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.leaps); mutex_unlock(&stp_mutex); return ret; } @@ -798,11 +690,11 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev, return ret; if (!stzi.lsoib.p) - return sprintf(buf, "0,0\n"); + return sysfs_emit(buf, "0,0\n"); - return sprintf(buf, "%lu,%d\n", - tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, - stzi.lsoib.nlso - stzi.lsoib.also); + return sysfs_emit(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); } static DEVICE_ATTR_RO(leap_seconds_scheduled); @@ -815,7 +707,7 @@ static ssize_t stratum_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.stratum); mutex_unlock(&stp_mutex); return ret; } @@ -830,7 +722,7 @@ static ssize_t time_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x0800)) - ret = sprintf(buf, "%i\n", (int) stp_info.tto); + ret = sysfs_emit(buf, "%i\n", (int)stp_info.tto); mutex_unlock(&stp_mutex); return ret; } @@ -845,7 +737,7 @@ static ssize_t time_zone_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x4000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.tzo); mutex_unlock(&stp_mutex); return ret; } @@ -860,7 +752,7 @@ static ssize_t timing_mode_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tmd); + ret = sysfs_emit(buf, "%i\n", stp_info.tmd); mutex_unlock(&stp_mutex); return ret; } @@ -875,7 +767,7 @@ static ssize_t timing_state_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tst); + ret = sysfs_emit(buf, "%i\n", stp_info.tst); mutex_unlock(&stp_mutex); return ret; } @@ -886,7 +778,7 @@ static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%i\n", stp_online); + return sysfs_emit(buf, "%i\n", stp_online); } static ssize_t online_store(struct device *dev, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 26aa2614ee35..1913a5566ac2 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2007, 2011 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ -#define KMSG_COMPONENT "cpu" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "cpu: " fmt +#include <linux/cpufeature.h> #include <linux/workqueue.h> #include <linux/memblock.h> #include <linux/uaccess.h> @@ -25,7 +24,9 @@ #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/node.h> +#include <asm/hiperdispatch.h> #include <asm/sysinfo.h> +#include <asm/asm.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -48,6 +49,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; +static int cpu_management; static DECLARE_WORK(topology_work, topology_work_fn); @@ -67,7 +69,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c static cpumask_t mask; cpumask_clear(&mask); - if (!cpu_online(cpu)) + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) goto out; cpumask_set_cpu(cpu, &mask); switch (topology_mode) { @@ -88,7 +90,7 @@ static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int c case TOPOLOGY_MODE_SINGLE: break; } - cpumask_and(&mask, &mask, cpu_online_mask); + cpumask_and(&mask, &mask, &cpu_setup_mask); out: cpumask_copy(dst, &mask); } @@ -96,19 +98,20 @@ out: static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) { static cpumask_t mask; - int i; + unsigned int max_cpu; cpumask_clear(&mask); - if (!cpu_online(cpu)) + if (!cpumask_test_cpu(cpu, &cpu_setup_mask)) goto out; cpumask_set_cpu(cpu, &mask); if (topology_mode != TOPOLOGY_MODE_HW) goto out; cpu -= cpu % (smp_cpu_mtid + 1); - for (i = 0; i <= smp_cpu_mtid; i++) - if (cpu_present(cpu + i)) - cpumask_set_cpu(cpu + i, &mask); - cpumask_and(&mask, &mask, cpu_online_mask); + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + if (cpumask_test_cpu(cpu, &cpu_setup_mask)) + cpumask_set_cpu(cpu, &mask); + } out: cpumask_copy(dst, &mask); } @@ -124,25 +127,27 @@ static void add_cpus_to_mask(struct topology_core *tl_core, unsigned int core; for_each_set_bit(core, &tl_core->mask, TOPOLOGY_CORE_BITS) { - unsigned int rcore; - int lcpu, i; + unsigned int max_cpu, rcore; + int cpu; rcore = TOPOLOGY_CORE_BITS - 1 - core + tl_core->origin; - lcpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); - if (lcpu < 0) + cpu = smp_find_processor_id(rcore << smp_cpu_mt_shift); + if (cpu < 0) continue; - for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &cpu_topology[lcpu + i]; + max_cpu = min(cpu + smp_cpu_mtid, nr_cpu_ids - 1); + for (; cpu <= max_cpu; cpu++) { + topo = &cpu_topology[cpu]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; topo->core_id = rcore; - topo->thread_id = lcpu + i; + topo->thread_id = cpu; topo->dedicated = tl_core->d; - cpumask_set_cpu(lcpu + i, &drawer->mask); - cpumask_set_cpu(lcpu + i, &book->mask); - cpumask_set_cpu(lcpu + i, &socket->mask); - smp_cpu_set_polarization(lcpu + i, tl_core->pp); + cpumask_set_cpu(cpu, &drawer->mask); + cpumask_set_cpu(cpu, &book->mask); + cpumask_set_cpu(cpu, &socket->mask); + smp_cpu_set_polarization(cpu, tl_core->pp); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); } } } @@ -220,22 +225,22 @@ static void topology_update_polarization_simple(void) static int ptf(unsigned long fc) { - int rc; + int cc; asm volatile( - " .insn rre,0xb9a20000,%1,%1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc) - : "d" (fc) : "cc"); - return rc; + " .insn rre,0xb9a20000,%[fc],%[fc]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [fc] "d" (fc) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } int topology_set_cpu_management(int fc) { int cpu, rc; - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return -EOPNOTSUPP; if (fc) rc = ptf(PTF_VERTICAL); @@ -269,6 +274,7 @@ void update_cpu_masks(void) topo->drawer_id = id; } } + hd_reset_state(); for_each_online_cpu(cpu) { topo = &cpu_topology[cpu]; pkg_first = cpumask_first(&topo->core_mask); @@ -277,8 +283,10 @@ void update_cpu_masks(void) for_each_cpu(sibling, &topo->core_mask) { topo_sibling = &cpu_topology[sibling]; smt_first = cpumask_first(&topo_sibling->thread_mask); - if (sibling == smt_first) + if (sibling == smt_first) { topo_package->booted_cores++; + hd_add_core(sibling); + } } } else { topo->booted_cores = topo_package->booted_cores; @@ -302,33 +310,33 @@ static void __arch_update_dedicated_flag(void *arg) static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - int rc = 0; + int rc, hd_status; + hd_status = 0; + rc = 0; mutex_lock(&smp_cpu_state_mutex); - if (MACHINE_HAS_TOPOLOGY) { + if (cpu_has_topology()) { rc = 1; store_topology(info); tl_to_masks(info); } update_cpu_masks(); - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) topology_update_polarization_simple(); + if (cpu_management == 1) + hd_status = hd_enable_hiperdispatch(); mutex_unlock(&smp_cpu_state_mutex); + if (hd_status == 0) + hd_disable_hiperdispatch(); return rc; } int arch_update_cpu_topology(void) { - struct device *dev; - int cpu, rc; + int rc; rc = __arch_update_cpu_topology(); on_each_cpu(__arch_update_dedicated_flag, NULL, 0); - for_each_online_cpu(cpu) { - dev = get_cpu_device(cpu); - if (dev) - kobject_uevent(&dev->kobj, KOBJ_CHANGE); - } return rc; } @@ -363,12 +371,12 @@ static void set_topology_timer(void) if (atomic_add_unless(&topology_poll, -1, 0)) mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); + mod_timer(&topology_timer, jiffies + secs_to_jiffies(60)); } void topology_expect_change(void) { - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) return; /* This is racy, but it doesn't matter since it is just a heuristic. * Worst case is that we poll in a higher frequency for a bit longer. @@ -379,7 +387,24 @@ void topology_expect_change(void) set_topology_timer(); } -static int cpu_management; +static int set_polarization(int polarization) +{ + int rc = 0; + + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == polarization) + goto out; + rc = topology_set_cpu_management(polarization); + if (rc) + goto out; + cpu_management = polarization; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc; +} static ssize_t dispatching_show(struct device *dev, struct device_attribute *attr, @@ -388,7 +413,7 @@ static ssize_t dispatching_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", cpu_management); + count = sysfs_emit(buf, "%d\n", cpu_management); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -405,19 +430,7 @@ static ssize_t dispatching_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - rc = 0; - get_online_cpus(); - mutex_lock(&smp_cpu_state_mutex); - if (cpu_management == val) - goto out; - rc = topology_set_cpu_management(val); - if (rc) - goto out; - cpu_management = val; - topology_expect_change(); -out: - mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); + rc = set_polarization(val); return rc ? rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -431,19 +444,19 @@ static ssize_t cpu_polarization_show(struct device *dev, mutex_lock(&smp_cpu_state_mutex); switch (smp_cpu_get_polarization(cpu)) { case POLARIZATION_HRZ: - count = sprintf(buf, "horizontal\n"); + count = sysfs_emit(buf, "horizontal\n"); break; case POLARIZATION_VL: - count = sprintf(buf, "vertical:low\n"); + count = sysfs_emit(buf, "vertical:low\n"); break; case POLARIZATION_VM: - count = sprintf(buf, "vertical:medium\n"); + count = sysfs_emit(buf, "vertical:medium\n"); break; case POLARIZATION_VH: - count = sprintf(buf, "vertical:high\n"); + count = sysfs_emit(buf, "vertical:high\n"); break; default: - count = sprintf(buf, "unknown\n"); + count = sysfs_emit(buf, "unknown\n"); break; } mutex_unlock(&smp_cpu_state_mutex); @@ -467,7 +480,7 @@ static ssize_t cpu_dedicated_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + count = sysfs_emit(buf, "%d\n", topology_cpu_dedicated(cpu)); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -487,7 +500,7 @@ int topology_cpu_init(struct cpu *cpu) int rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_cpu_attr_group); - if (rc || !MACHINE_HAS_TOPOLOGY) + if (rc || !cpu_has_topology()) return rc; rc = sysfs_create_group(&cpu->dev.kobj, &topology_extra_cpu_attr_group); if (rc) @@ -495,33 +508,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, - { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -535,33 +542,38 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } +static int __init detect_polarization(union topology_entry *tle) +{ + struct topology_core *tl_core; + + while (tle->nl) + tle = next_tle(tle); + tl_core = (struct topology_core *)tle; + return tl_core->pp != POLARIZATION_HRZ; +} + void __init topology_init_early(void) { struct sysinfo_15_1_x *info; set_sched_topology(s390_topology); if (topology_mode == TOPOLOGY_MODE_UNINITIALIZED) { - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) topology_mode = TOPOLOGY_MODE_HW; else topology_mode = TOPOLOGY_MODE_SINGLE; } - if (!MACHINE_HAS_TOPOLOGY) + if (!cpu_has_topology()) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); + cpu_management = detect_polarization(info->tle); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", info->mag[0], info->mag[1], info->mag[2], info->mag[3], info->mag[4], info->mag[5], info->mnest); @@ -569,6 +581,7 @@ void __init topology_init_early(void) alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); out: + cpumask_set_cpu(0, &cpu_setup_mask); __arch_update_cpu_topology(); __arch_update_dedicated_flag(NULL); } @@ -577,7 +590,7 @@ static inline int topology_get_mode(int enabled) { if (!enabled) return TOPOLOGY_MODE_SINGLE; - return MACHINE_HAS_TOPOLOGY ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; + return cpu_has_topology() ? TOPOLOGY_MODE_HW : TOPOLOGY_MODE_PACKAGE; } static inline int topology_is_enabled(void) @@ -598,7 +611,7 @@ static int __init topology_setup(char *str) } early_param("topology", topology_setup); -static int topology_ctl_handler(struct ctl_table *ctl, int write, +static int topology_ctl_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); @@ -628,33 +641,58 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write, return rc; } -static struct ctl_table topology_ctl_table[] = { +static int polarization_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int polarization; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &polarization, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + polarization = cpu_management; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + return set_polarization(polarization); +} + +static const struct ctl_table topology_ctl_table[] = { { .procname = "topology", .mode = 0644, .proc_handler = topology_ctl_handler, }, - { }, -}; - -static struct ctl_table topology_dir_table[] = { { - .procname = "s390", - .maxlen = 0, - .mode = 0555, - .child = topology_ctl_table, + .procname = "polarization", + .mode = 0644, + .proc_handler = polarization_ctl_handler, }, - { }, }; static int __init topology_init(void) { + struct device *dev_root; + int rc = 0; + timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE); - if (MACHINE_HAS_TOPOLOGY) + if (cpu_has_topology()) set_topology_timer(); else topology_update_polarization_simple(); - register_sysctl_table(topology_dir_table); - return device_create_file(cpu_subsys.dev_root, &dev_attr_dispatching); + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY_VERTICAL)) + set_polarization(1); + register_sysctl("s390", topology_ctl_table); + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_dispatching); + put_device(dev_root); + } + return rc; } device_initcall(topology_init); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 76947275fe8b..19687dab32f7 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -3,18 +3,13 @@ * S390 version * Copyright IBM Corp. 1999, 2000 * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), + * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), * * Derived from "arch/i386/kernel/traps.c" * Copyright (C) 1991, 1992 Linus Torvalds */ -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. - */ -#include "asm/irqflags.h" -#include "asm/ptrace.h" +#include <linux/cpufeature.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/randomize_kstack.h> @@ -27,8 +22,13 @@ #include <linux/uaccess.h> #include <linux/cpu.h> #include <linux/entry-common.h> -#include <asm/fpu/api.h> +#include <linux/kmsan.h> +#include <asm/asm-extable.h> +#include <asm/irqflags.h> +#include <asm/ptrace.h> #include <asm/vtime.h> +#include <asm/fpu.h> +#include <asm/fault.h> #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -39,31 +39,30 @@ static inline void __user *get_trap_ip(struct pt_regs *regs) address = current->thread.trap_tdb.data[3]; else address = regs->psw.addr; - return (void __user *) (address - (regs->int_code >> 16)); + return (void __user *)(address - (regs->int_code >> 16)); } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return 1; } +#endif void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { if (user_mode(regs)) { force_sig_fault(si_signo, si_code, get_trap_ip(regs)); report_user_fault(regs, si_signo, 0); - } else { - const struct exception_table_entry *fixup; - fixup = s390_search_extables(regs->psw.addr); - if (!fixup || !ex_handle(fixup, regs)) + } else { + if (!fixup_exception(regs)) die(regs, str); - } + } } static void do_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) { - if (notify_die(DIE_TRAP, str, regs, 0, - regs->int_code, si_signo) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, str, regs, 0, regs->int_code, si_signo) == NOTIFY_STOP) return; do_report_trap(regs, si_signo, si_code, str); } @@ -75,8 +74,7 @@ void do_per_trap(struct pt_regs *regs) return; if (!current->ptrace) return; - force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __force __user *) current->thread.per_event.address); + force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __force __user *)current->thread.per_event.address); } NOKPROBE_SYMBOL(do_per_trap); @@ -84,7 +82,7 @@ static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - do_exit(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } @@ -95,36 +93,25 @@ static void name(struct pt_regs *regs) \ do_trap(regs, signr, sicode, str); \ } -DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, - "addressing exception") -DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, - "execute exception") -DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, - "fixpoint divide exception") -DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, - "fixpoint overflow exception") -DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, - "HFP overflow exception") -DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, - "HFP underflow exception") -DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, - "HFP significance exception") -DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, - "HFP divide exception") -DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, - "HFP square root exception") -DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, - "operand exception") -DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, - "privileged operation") -DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, - "special operation exception") -DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, - "transaction constraint exception") +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, "addressing exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, "fixpoint divide exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, "execute exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, "HFP divide exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, "HFP overflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, "HFP significance exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, "HFP square root exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, "HFP underflow exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, "operand exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, "fixpoint overflow exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, "special operation exception") +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, "transaction constraint exception") static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) { int si_code = 0; + /* FPC[2] is Data Exception Code */ if ((fpc & 0x00000300) == 0) { /* bits 6 and 7 of DXC are 0 iff IEEE exception */ @@ -142,44 +129,43 @@ static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) do_trap(regs, SIGFPE, si_code, "floating point exception"); } -static void translation_exception(struct pt_regs *regs) +static void translation_specification_exception(struct pt_regs *regs) { /* May never happen. */ - panic("Translation exception"); + panic("Translation-Specification Exception"); } static void illegal_op(struct pt_regs *regs) { - __u8 opcode[6]; - __u16 __user *location; int is_uprobe_insn = 0; + u16 __user *location; int signal = 0; + u16 opcode; location = get_trap_ip(regs); - if (user_mode(regs)) { - if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) + if (get_user(opcode, location)) return; - if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { + if (opcode == S390_BREAKPOINT_U16) { if (current->ptrace) force_sig_fault(SIGTRAP, TRAP_BRKPT, location); else signal = SIGILL; #ifdef CONFIG_UPROBES - } else if (*((__u16 *) opcode) == UPROBE_SWBP_INSN) { + } else if (opcode == UPROBE_SWBP_INSN) { is_uprobe_insn = 1; #endif - } else + } else { signal = SIGILL; + } } /* - * We got either an illegal op in kernel mode, or user space trapped + * This is either an illegal op in kernel mode, or user space trapped * on a uprobes illegal instruction. See if kprobes or uprobes picks * it up. If not, SIGILL. */ if (is_uprobe_insn || !user_mode(regs)) { - if (notify_die(DIE_BPT, "bpt", regs, 0, - 3, SIGTRAP) != NOTIFY_STOP) + if (notify_die(DIE_BPT, "bpt", regs, 0, 3, SIGTRAP) != NOTIFY_STOP) signal = SIGILL; } if (signal) @@ -187,21 +173,13 @@ static void illegal_op(struct pt_regs *regs) } NOKPROBE_SYMBOL(illegal_op); -DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, - "specification exception"); - static void vector_exception(struct pt_regs *regs) { int si_code, vic; - if (!MACHINE_HAS_VX) { - do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); - return; - } - /* get vector interrupt code from fpc */ - save_fpu_regs(); - vic = (current->thread.fpu.fpc & 0xf00) >> 8; + save_user_fpu_regs(); + vic = (current->thread.ufpu.fpc & 0xf00) >> 8; switch (vic) { case 1: /* invalid vector operation */ si_code = FPE_FLTINV; @@ -226,9 +204,9 @@ static void vector_exception(struct pt_regs *regs) static void data_exception(struct pt_regs *regs) { - save_fpu_regs(); - if (current->thread.fpu.fpc & FPC_DXC_MASK) - do_fp_trap(regs, current->thread.fpu.fpc); + save_user_fpu_regs(); + if (current->thread.ufpu.fpc & FPC_DXC_MASK) + do_fp_trap(regs, current->thread.ufpu.fpc); else do_trap(regs, SIGILL, ILL_ILLOPN, "data exception"); } @@ -244,16 +222,11 @@ static void space_switch_exception(struct pt_regs *regs) static void monitor_event_exception(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - if (user_mode(regs)) return; - switch (report_bug(regs->psw.addr - (regs->int_code >> 16), regs)) { case BUG_TRAP_TYPE_NONE: - fixup = s390_search_extables(regs->psw.addr); - if (fixup) - ex_handle(fixup, regs); + fixup_exception(regs); break; case BUG_TRAP_TYPE_WARN: break; @@ -263,15 +236,20 @@ static void monitor_event_exception(struct pt_regs *regs) } } -void kernel_stack_overflow(struct pt_regs *regs) +void kernel_stack_invalid(struct pt_regs *regs) { + /* + * Normally regs are unpoisoned by the generic entry code, but + * kernel_stack_overflow() is a rare case that is called bypassing it. + */ + kmsan_unpoison_entry_regs(regs); bust_spinlocks(1); - printk("Kernel stack overflow.\n"); + pr_emerg("Kernel stack pointer invalid\n"); show_regs(regs); bust_spinlocks(0); - panic("Corrupt kernel stack, can't continue."); + panic("Invalid kernel stack pointer, cannot continue"); } -NOKPROBE_SYMBOL(kernel_stack_overflow); +NOKPROBE_SYMBOL(kernel_stack_invalid); static void __init test_monitor_call(void) { @@ -279,19 +257,30 @@ static void __init test_monitor_call(void) if (!IS_ENABLED(CONFIG_BUG)) return; - asm volatile( + asm_inline volatile( " mc 0,0\n" - "0: xgr %0,%0\n" + "0: lhi %[val],0\n" "1:\n" - EX_TABLE(0b,1b) - : "+d" (val)); + EX_TABLE(0b, 1b) + : [val] "+d" (val)); if (!val) panic("Monitor call doesn't work!\n"); } void __init trap_init(void) { - sort_extable(__start_dma_ex_table, __stop_dma_ex_table); + struct lowcore *lc = get_lowcore(); + unsigned long flags; + struct ctlreg cr0; + + local_irq_save(flags); + cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); + psw_bits(lc->external_new_psw).mcheck = 1; + psw_bits(lc->program_new_psw).mcheck = 1; + psw_bits(lc->svc_new_psw).mcheck = 1; + psw_bits(lc->io_new_psw).mcheck = 1; + local_ctl_load(0, &cr0); + local_irq_restore(flags); local_mcck_enable(); test_monitor_call(); } @@ -300,36 +289,47 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned long last_break = S390_lowcore.breaking_event_addr; - unsigned int trapnr; + struct lowcore *lc = get_lowcore(); irqentry_state_t state; + unsigned int trapnr; + union teid teid; - regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; - regs->int_parm_long = S390_lowcore.trans_exc_code; - + teid.val = lc->trans_exc_code; + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = teid.val; + /* + * In case of a guest fault, short-circuit the fault handler and return. + * This way the sie64a() function will return 0; fault address and + * other relevant bits are saved in current->thread.gmap_teid, and + * the fault number in current->thread.gmap_int_code. KVM will be + * able to use this information to handle the fault. + */ + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) { + current->thread.gmap_teid.val = regs->int_parm_long; + current->thread.gmap_int_code = regs->int_code & 0xffff; + return; + } state = irqentry_enter(regs); - if (user_mode(regs)) { update_timer_sys(); - if (last_break < 4096) - last_break = 1; - current->thread.last_break = last_break; - regs->args[0] = last_break; + if (!cpu_has_bear()) { + if (regs->last_break < 4096) + regs->last_break = 1; + } + current->thread.last_break = regs->last_break; } - - if (S390_lowcore.pgm_code & 0x0200) { + if (lc->pgm_code & 0x0200) { /* transaction abort */ - current->thread.trap_tdb = S390_lowcore.pgm_tdb; + current->thread.trap_tdb = lc->pgm_tdb; } - - if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (lc->pgm_code & PGM_INT_CODE_PER) { if (user_mode(regs)) { struct per_event *ev = ¤t->thread.per_event; set_thread_flag(TIF_PER_TRAP); - ev->address = S390_lowcore.per_address; - ev->cause = *(u16 *)&S390_lowcore.per_code; - ev->paid = S390_lowcore.per_access_id; + ev->address = lc->per_address; + ev->cause = lc->per_code_combined; + ev->paid = lc->per_access_id; } else { /* PER event in kernel is kprobes */ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); @@ -337,11 +337,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) goto out; } } - if (!irqs_disabled_flags(regs->psw.mask)) trace_hardirqs_on(); __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); - trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); @@ -374,7 +372,7 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { [0x0f] = hfp_divide_exception, [0x10] = do_dat_exception, [0x11] = do_dat_exception, - [0x12] = translation_exception, + [0x12] = translation_specification_exception, [0x13] = special_op_exception, [0x14] = default_trap_handler, [0x15] = operand_exception, @@ -393,8 +391,8 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { [0x3b] = do_dat_exception, [0x3c] = default_trap_handler, [0x3d] = do_secure_storage_access, - [0x3e] = do_non_secure_storage_access, - [0x3f] = do_secure_storage_violation, + [0x3e] = default_trap_handler, + [0x3f] = default_trap_handler, [0x40] = monitor_event_exception, [0x41 ... 0x7f] = default_trap_handler, }; @@ -405,5 +403,3 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { __stringify(default_trap_handler)) COND_TRAP(do_secure_storage_access); -COND_TRAP(do_non_secure_storage_access); -COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 707fd99f6734..0f88caca4eaf 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/export.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> @@ -49,6 +51,8 @@ static inline bool is_final_pt_regs(struct unwind_state *state, READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; @@ -64,8 +68,8 @@ bool unwind_next_frame(struct unwind_state *state) ip = READ_ONCE_NOCHECK(sf->gprs[8]); reliable = false; regs = NULL; - if (!__kernel_text_address(ip)) { - /* skip bogus %r14 */ + /* skip bogus %r14 or if is the same as regs->psw.addr */ + if (!__kernel_text_address(ip) || state->ip == unwind_recover_ret_addr(state, ip)) { state->regs = NULL; return unwind_next_frame(state); } @@ -103,13 +107,11 @@ bool unwind_next_frame(struct unwind_state *state) if (sp & 0x7) goto out_err; - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *) sp); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->regs = regs; state->reliable = reliable; + state->ip = unwind_recover_ret_addr(state, ip); return true; out_err: @@ -120,6 +122,8 @@ out_stop: } EXPORT_SYMBOL_GPL(unwind_next_frame); +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame) { @@ -161,12 +165,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, ip = READ_ONCE_NOCHECK(sf->gprs[8]); } - ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); - /* Update unwind state */ state->sp = sp; - state->ip = ip; state->reliable = true; + state->ip = unwind_recover_ret_addr(state, ip); if (!first_frame) return; diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index bbf8622bbf5d..c624f3361e43 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -8,11 +8,9 @@ #include <linux/uaccess.h> #include <linux/uprobes.h> -#include <linux/compat.h> #include <linux/kdebug.h> #include <linux/sched/task_stack.h> -#include <asm/switch_to.h> #include <asm/facility.h> #include <asm/kprobes.h> #include <asm/dis.h> @@ -30,7 +28,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) return -EINVAL; - if (!is_compat_task() && psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) + if (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) return -EINVAL; clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; @@ -126,6 +124,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, case DIE_SSTEP: if (uprobe_post_sstep_notifier(regs)) return NOTIFY_STOP; + break; default: break; } @@ -161,11 +160,6 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, /* Instruction Emulation */ -static void adjust_psw_addr(psw_t *psw, unsigned long len) -{ - psw->addr = __rewind_psw(*psw, -len); -} - #define EMU_ILLEGAL_OP 1 #define EMU_SPECIFICATION 2 #define EMU_ADDRESSING 3 @@ -176,9 +170,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -193,9 +185,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(ptr) __ptr = (ptr); \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)__ptr & mask) \ + if ((u64 __force)__ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (put_user(*(input), __ptr)) \ __rc = EMU_ADDRESSING; \ @@ -212,9 +202,7 @@ static void adjust_psw_addr(psw_t *psw, unsigned long len) __typeof__(*(ptr)) input; \ int __rc = 0; \ \ - if (!test_facility(34)) \ - __rc = EMU_ILLEGAL_OP; \ - else if ((u64 __force)ptr & mask) \ + if ((u64 __force)ptr & mask) \ __rc = EMU_SPECIFICATION; \ else if (get_user(input, ptr)) \ __rc = EMU_ADDRESSING; \ @@ -326,10 +314,6 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) break; case 0xc6: switch (insn->opc1) { - case 0x02: /* pfdrl */ - if (!test_facility(34)) - rc = EMU_ILLEGAL_OP; - break; case 0x04: /* cghrl */ rc = emu_cmp_ril(regs, (s16 __user *)uptr, &rx->s64); break; @@ -363,7 +347,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) } break; } - adjust_psw_addr(®s->psw, ilen); + regs->psw.addr = __forward_psw(regs->psw, ilen); switch (rc) { case EMU_ILLEGAL_OP: regs->int_code = ilen << 16 | 0x0001; @@ -383,8 +367,7 @@ static void handle_insn_ril(struct arch_uprobe *auprobe, struct pt_regs *regs) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { if ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_24BIT) || - ((psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT) && - !is_compat_task())) { + (psw_bits(regs->psw).eaba == PSW_BITS_AMODE_31BIT)) { regs->psw.addr = __rewind_psw(regs->psw, UPROBE_SWBP_INSN_SIZE); do_report_trap(regs, SIGILL, ILL_ILLADR, NULL); return true; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index aeb0a15bcbb7..ed46950be86f 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -2,11 +2,11 @@ /* * Common Ultravisor functions and initialization * - * Copyright IBM Corp. 2019, 2020 + * Copyright IBM Corp. 2019, 2024 */ -#define KMSG_COMPONENT "prot_virt" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "prot_virt: " fmt +#include <linux/export.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/sizes.h> @@ -14,23 +14,31 @@ #include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/pagewalk.h> +#include <linux/backing-dev.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); -#endif +EXPORT_SYMBOL(prot_virt_guest); +/* + * uv_info contains both host and guest information but it's currently only + * expected to be used within modules if it's the KVM module or for + * any PV guest module. + * + * The kernel itself will write these values once in uv_query_info() + * and then make some of them readable via a sysfs interface. + */ struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); -#if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); -EXPORT_SYMBOL(uv_info); -static int __init uv_init(unsigned long stor_base, unsigned long stor_len) +static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len) { struct uv_cb_init uvcb = { .header.cmd = UVC_CMD_INIT_UV, @@ -49,27 +57,12 @@ static int __init uv_init(unsigned long stor_base, unsigned long stor_len) void __init setup_uv(void) { - unsigned long uv_stor_base; + void *uv_stor_base; - /* - * keep these conditions in line with has_uv_sec_stor_limit() - */ if (!is_prot_virt_host()) return; - if (is_prot_virt_guest()) { - prot_virt_host = 0; - pr_warn("Protected virtualization not available in protected guests."); - return; - } - - if (!test_facility(158)) { - prot_virt_host = 0; - pr_warn("Protected virtualization not supported by the hardware."); - return; - } - - uv_stor_base = (unsigned long)memblock_alloc_try_nid( + uv_stor_base = memblock_alloc_try_nid( uv_info.uv_base_stor_len, SZ_1M, SZ_2G, MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); if (!uv_stor_base) { @@ -78,7 +71,7 @@ void __init setup_uv(void) goto fail; } - if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { + if (uv_init(__pa(uv_stor_base), uv_info.uv_base_stor_len)) { memblock_free(uv_stor_base, uv_info.uv_base_stor_len); goto fail; } @@ -95,7 +88,7 @@ fail: * Requests the Ultravisor to pin the page in the shared state. This will * cause an intercept when the guest attempts to unshare the pinned page. */ -static int uv_pin_shared(unsigned long paddr) +int uv_pin_shared(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_PIN_PAGE_SHARED, @@ -107,6 +100,7 @@ static int uv_pin_shared(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_pin_shared); /* * Requests the Ultravisor to destroy a guest page and make it @@ -115,7 +109,7 @@ static int uv_pin_shared(unsigned long paddr) * * @paddr: Absolute host address of page to be destroyed */ -int uv_destroy_page(unsigned long paddr) +static int uv_destroy(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_DESTR_SEC_STOR, @@ -136,6 +130,35 @@ int uv_destroy_page(unsigned long paddr) } /* + * The caller must already hold a reference to the folio + */ +int uv_destroy_folio(struct folio *folio) +{ + int rc; + + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_destroy(folio_to_phys(folio)); + if (!rc) + clear_bit(PG_arch_1, &folio->flags.f); + folio_put(folio); + return rc; +} +EXPORT_SYMBOL(uv_destroy_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. + */ +int uv_destroy_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_destroy_folio(pfn_folio(pte_pfn(pte))); +} + +/* * Requests the Ultravisor to encrypt a guest page and make it * accessible to the host for paging (export). * @@ -153,211 +176,389 @@ int uv_convert_from_secure(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure); /* - * Calculate the expected ref_count for a page that would otherwise have no + * The caller must already hold a reference to the folio. + */ +int uv_convert_from_secure_folio(struct folio *folio) +{ + int rc; + + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_convert_from_secure(folio_to_phys(folio)); + if (!rc) + clear_bit(PG_arch_1, &folio->flags.f); + folio_put(folio); + return rc; +} +EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. + */ +int uv_convert_from_secure_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); +} + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +/* + * Calculate the expected ref_count for a folio that would otherwise have no * further pins. This was cribbed from similar functions in other places in * the kernel, but with some slight modifications. We know that a secure - * page can not be a huge page for example. + * folio can not be a large folio, for example. */ -static int expected_page_refs(struct page *page) +static int expected_folio_refs(struct folio *folio) { int res; - res = page_mapcount(page); - if (PageSwapCache(page)) { + res = folio_mapcount(folio); + if (folio_test_swapcache(folio)) { res++; - } else if (page_mapping(page)) { + } else if (folio_mapping(folio)) { res++; - if (page_has_private(page)) + if (folio->private) res++; } return res; } -static int make_secure_pte(pte_t *ptep, unsigned long addr, - struct page *exp_page, struct uv_cb_header *uvcb) +/** + * __make_folio_secure() - make a folio secure + * @folio: the folio to make secure + * @uvcb: the uvcb that describes the UVC to be used + * + * The folio @folio will be made secure if possible, @uvcb will be passed + * as-is to the UVC. + * + * Return: 0 on success; + * -EBUSY if the folio is in writeback or has too many references; + * -EAGAIN if the UVC needs to be attempted again; + * -ENXIO if the address is not mapped; + * -EINVAL if the UVC failed for other reasons. + * + * Context: The caller must hold exactly one extra reference on the folio + * (it's the same logic as split_folio()), and the folio must be + * locked. + */ +static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { - pte_t entry = READ_ONCE(*ptep); - struct page *page; - int expected, rc = 0; + int expected, cc = 0; - if (!pte_present(entry)) - return -ENXIO; - if (pte_val(entry) & _PAGE_INVALID) - return -ENXIO; + if (folio_test_writeback(folio)) + return -EBUSY; + expected = expected_folio_refs(folio) + 1; + if (!folio_ref_freeze(folio, expected)) + return -EBUSY; + set_bit(PG_arch_1, &folio->flags.f); + /* + * If the UVC does not succeed or fail immediately, we don't want to + * loop for long, or we might get stall notifications. + * On the other hand, this is a complex scenario and we are holding a lot of + * locks, so we can't easily sleep and reschedule. We try only once, + * and if the UVC returned busy or partial completion, we return + * -EAGAIN and we let the callers deal with it. + */ + cc = __uv_call(0, (u64)uvcb); + folio_ref_unfreeze(folio, expected); + /* + * Return -ENXIO if the folio was not mapped, -EINVAL for other errors. + * If busy or partially completed, return -EAGAIN. + */ + if (cc == UVC_CC_OK) + return 0; + else if (cc == UVC_CC_BUSY || cc == UVC_CC_PARTIAL) + return -EAGAIN; + return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; +} - page = pte_page(entry); - if (page != exp_page) - return -ENXIO; - if (PageWriteback(page)) +static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) +{ + int rc; + + if (!folio_trylock(folio)) return -EAGAIN; - expected = expected_page_refs(page); - if (!page_ref_freeze(page, expected)) - return -EBUSY; - set_bit(PG_arch_1, &page->flags); - rc = uv_call(0, (u64)uvcb); - page_ref_unfreeze(page, expected); - /* Return -ENXIO if the page was not mapped, -EINVAL otherwise */ - if (rc) - rc = uvcb->rc == 0x10a ? -ENXIO : -EINVAL; + if (should_export_before_import(uvcb, mm)) + uv_convert_from_secure(folio_to_phys(folio)); + rc = __make_folio_secure(folio, uvcb); + folio_unlock(folio); + return rc; } -/* - * Requests the Ultravisor to make a page accessible to a guest. - * If it's brought in the first time, it will be cleared. If - * it has been exported before, it will be decrypted and integrity - * checked. +/** + * s390_wiggle_split_folio() - try to drain extra references to a folio and + * split the folio if it is large. + * @mm: the mm containing the folio to work on + * @folio: the folio + * + * Context: Must be called while holding an extra reference to the folio; + * the mm lock should not be held. + * Return: 0 if the operation was successful; + * -EAGAIN if splitting the large folio was not successful, + * but another attempt can be made; + * -EINVAL in case of other folio splitting errors. See split_folio(). */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) { - struct vm_area_struct *vma; - bool local_drain = false; - spinlock_t *ptelock; - unsigned long uaddr; - struct page *page; - pte_t *ptep; - int rc; + int rc, tried_splits; -again: - rc = -EFAULT; - mmap_read_lock(gmap->mm); + lockdep_assert_not_held(&mm->mmap_lock); + folio_wait_writeback(folio); + lru_add_drain_all(); + + if (!folio_test_large(folio)) + return 0; + + for (tried_splits = 0; tried_splits < 2; tried_splits++) { + struct address_space *mapping; + loff_t lstart, lend; + struct inode *inode; + + folio_lock(folio); + rc = split_folio(folio); + if (rc != -EBUSY) { + folio_unlock(folio); + return rc; + } - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = find_vma(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. If - * userspace is playing dirty tricky with mapping huge pages later - * on this will result in a segmentation fault. - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = -ENXIO; - page = follow_page(vma, uaddr, FOLL_WRITE); - if (IS_ERR_OR_NULL(page)) - goto out; - - lock_page(page); - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - rc = make_secure_pte(ptep, uaddr, page, uvcb); - pte_unmap_unlock(ptep, ptelock); - unlock_page(page); -out: - mmap_read_unlock(gmap->mm); - - if (rc == -EAGAIN) { - wait_on_page_writeback(page); - } else if (rc == -EBUSY) { /* - * If we have tried a local drain and the page refcount - * still does not match our expected safe value, try with a - * system wide drain. This is needed if the pagevecs holding - * the page are on a different CPU. + * Splitting with -EBUSY can fail for various reasons, but we + * have to handle one case explicitly for now: some mappings + * don't allow for splitting dirty folios; writeback will + * mark them clean again, including marking all page table + * entries mapping the folio read-only, to catch future write + * attempts. + * + * While the system should be writing back dirty folios in the + * background, we obtained this folio by looking up a writable + * page table entry. On these problematic mappings, writable + * page table entries imply dirty folios, preventing the + * split in the first place. + * + * To prevent a livelock when trigger writeback manually and + * letting the caller look up the folio again in the page + * table (turning it dirty), immediately try to split again. + * + * This is only a problem for some mappings (e.g., XFS); + * mappings that do not support writeback (e.g., shmem) do not + * apply. */ - if (local_drain) { - lru_add_drain_all(); - /* We give up here, and let the caller try again */ - return -EAGAIN; + if (!folio_test_dirty(folio) || folio_test_anon(folio) || + !folio->mapping || !mapping_can_writeback(folio->mapping)) { + folio_unlock(folio); + break; } + /* - * We are here if the page refcount does not match the - * expected safe value. The main culprits are usually - * pagevecs. With lru_add_drain() we drain the pagevecs - * on the local CPU so that hopefully the refcount will - * reach the expected safe value. + * Ideally, we'd only trigger writeback on this exact folio. But + * there is no easy way to do that, so we'll stabilize the + * mapping while we still hold the folio lock, so we can drop + * the folio lock to trigger writeback on the range currently + * covered by the folio instead. */ - lru_add_drain(); - local_drain = true; - /* And now we try again immediately after draining */ - goto again; - } else if (rc == -ENXIO) { - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) - return -EFAULT; - return -EAGAIN; + mapping = folio->mapping; + lstart = folio_pos(folio); + lend = lstart + folio_size(folio) - 1; + inode = igrab(mapping->host); + folio_unlock(folio); + + if (unlikely(!inode)) + break; + + filemap_write_and_wait_range(mapping, lstart, lend); + iput(mapping->host); } - return rc; + return -EAGAIN; } -EXPORT_SYMBOL_GPL(gmap_make_secure); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) { - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; + struct vm_area_struct *vma; + struct folio_walk fw; + struct folio *folio; + int rc; + + mmap_read_lock(mm); + vma = vma_lookup(mm, hva); + if (!vma) { + mmap_read_unlock(mm); + return -EFAULT; + } + folio = folio_walk_start(&fw, vma, hva, 0); + if (!folio) { + mmap_read_unlock(mm); + return -ENXIO; + } + + folio_get(folio); + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. + * If userspace plays dirty tricks and decides to map huge pages at a + * later point in time, it will receive a segmentation fault or + * KVM_RUN will return -EFAULT. + */ + if (folio_test_hugetlb(folio)) + rc = -EFAULT; + else if (folio_test_large(folio)) + rc = -E2BIG; + else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID)) + rc = -ENXIO; + else + rc = make_folio_secure(mm, folio, uvcb); + folio_walk_end(&fw, vma); + mmap_read_unlock(mm); + + if (rc == -E2BIG || rc == -EBUSY) { + rc = s390_wiggle_split_folio(mm, folio); + if (!rc) + rc = -EAGAIN; + } + folio_put(folio); - return gmap_make_secure(gmap, gaddr, &uvcb); + return rc; } -EXPORT_SYMBOL_GPL(gmap_convert_to_secure); +EXPORT_SYMBOL_GPL(make_hva_secure); /* - * To be called with the page locked or with an extra reference! This will - * prevent gmap_make_secure from touching the page concurrently. Having 2 - * parallel make_page_accessible is fine, as the UV calls will become a - * no-op if the page is already exported. + * To be called with the folio locked or with an extra reference! This will + * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. + * Having 2 parallel arch_make_folio_accessible is fine, as the UV calls will + * become a no-op if the folio is already exported. */ -int arch_make_page_accessible(struct page *page) +int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* Hugepage cannot be protected, so nothing to do */ - if (PageHuge(page)) + /* Large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) return 0; /* - * PG_arch_1 is used in 3 places: - * 1. for kernel page tables during early boot - * 2. for storage keys of huge pages and KVM - * 3. As an indication that this page might be secure. This can + * PG_arch_1 is used in 2 places: + * 1. for storage keys of hugetlb folios and KVM + * 2. As an indication that this small folio might be secure. This can * overindicate, e.g. we set the bit before calling * convert_to_secure. - * As secure pages are never huge, all 3 variants can co-exists. + * As secure pages are never large folios, both variants can co-exists. */ - if (!test_bit(PG_arch_1, &page->flags)) + if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; - rc = uv_pin_shared(page_to_phys(page)); + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } - rc = uv_convert_from_secure(page_to_phys(page)); + rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } return rc; } -EXPORT_SYMBOL_GPL(arch_make_page_accessible); +EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -#endif - -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) static ssize_t uv_query_facilities(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n", - uv_info.inst_calls_list[0], - uv_info.inst_calls_list[1], - uv_info.inst_calls_list[2], - uv_info.inst_calls_list[3]); + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); } static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + static ssize_t uv_query_feature_indications(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -368,69 +569,208 @@ static struct kobj_attribute uv_query_feature_indications_attr = __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_guest_cpu_id + 1); + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); } static struct kobj_attribute uv_query_max_guest_cpus_attr = __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); static ssize_t uv_query_max_guest_vms(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_num_sec_conf); + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); } static struct kobj_attribute uv_query_max_guest_vms_attr = __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); static ssize_t uv_query_max_guest_addr(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.max_sec_stor_addr); + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); } static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", + uv_info.max_assoc_secrets + uv_info.max_retr_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + +static ssize_t uv_query_max_retr_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_retr_secrets); +} + +static struct kobj_attribute uv_query_max_retr_secrets_attr = + __ATTR(max_retr_secrets, 0444, uv_query_max_retr_secrets, NULL); + +static ssize_t uv_query_max_assoc_secrets(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_assoc_secrets); +} + +static struct kobj_attribute uv_query_max_assoc_secrets_attr = + __ATTR(max_assoc_secrets, 0444, uv_query_max_assoc_secrets, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, + &uv_query_max_assoc_secrets_attr.attr, + &uv_query_max_retr_secrets_attr.attr, NULL, }; +static inline struct uv_cb_query_keys uv_query_keys(void) +{ + struct uv_cb_query_keys uvcb = { + .header.cmd = UVC_CMD_QUERY_KEYS, + .header.len = sizeof(uvcb) + }; + + uv_call(0, (uint64_t)&uvcb); + return uvcb; +} + +static inline ssize_t emit_hash(struct uv_key_hash *hash, char *buf, int at) +{ + return sysfs_emit_at(buf, at, "%016llx%016llx%016llx%016llx\n", + hash->dword[0], hash->dword[1], hash->dword[2], hash->dword[3]); +} + +static ssize_t uv_keys_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_host_key_attr = + __ATTR(host_key, 0444, uv_keys_host_key, NULL); + +static ssize_t uv_keys_backup_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_BACK_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_backup_host_key_attr = + __ATTR(backup_host_key, 0444, uv_keys_backup_host_key, NULL); + +static ssize_t uv_keys_all(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + ssize_t len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(uvcb.key_hashes); i++) + len += emit_hash(uvcb.key_hashes + i, buf, len); + + return len; +} + +static struct kobj_attribute uv_keys_all_attr = + __ATTR(all, 0444, uv_keys_all, NULL); + static struct attribute_group uv_query_attr_group = { .attrs = uv_query_attrs, }; +static struct attribute *uv_keys_attrs[] = { + &uv_keys_host_key_attr.attr, + &uv_keys_backup_host_key_attr.attr, + &uv_keys_all_attr.attr, + NULL, +}; + +static struct attribute_group uv_keys_attr_group = { + .attrs = uv_keys_attrs, +}; + static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - int val = 0; - -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST - val = prot_virt_guest; -#endif - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_guest); } static ssize_t uv_is_prot_virt_host(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - int val = 0; - -#if IS_ENABLED(CONFIG_KVM) - val = prot_virt_host; -#endif - - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_host); } static struct kobj_attribute uv_prot_virt_guest = @@ -446,9 +786,27 @@ static const struct attribute *uv_prot_virt_attrs[] = { }; static struct kset *uv_query_kset; +static struct kset *uv_keys_kset; static struct kobject *uv_kobj; -static int __init uv_info_init(void) +static int __init uv_sysfs_dir_init(const struct attribute_group *grp, + struct kset **uv_dir_kset, const char *name) +{ + struct kset *kset; + int rc; + + kset = kset_create_and_add(name, NULL, uv_kobj); + if (!kset) + return -ENOMEM; + *uv_dir_kset = kset; + + rc = sysfs_create_group(&kset->kobj, grp); + if (rc) + kset_unregister(kset); + return rc; +} + +static int __init uv_sysfs_init(void) { int rc = -ENOMEM; @@ -463,17 +821,16 @@ static int __init uv_info_init(void) if (rc) goto out_kobj; - uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); - if (!uv_query_kset) { - rc = -ENOMEM; + rc = uv_sysfs_dir_init(&uv_query_attr_group, &uv_query_kset, "query"); + if (rc) goto out_ind_files; - } - rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); - if (!rc) - return 0; + /* Get installed key hashes if available, ignore any errors */ + if (test_bit_inv(BIT_UVC_CMD_QUERY_KEYS, uv_info.inst_calls_list)) + uv_sysfs_dir_init(&uv_keys_attr_group, &uv_keys_kset, "keys"); + + return 0; - kset_unregister(uv_query_kset); out_ind_files: sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); out_kobj: @@ -481,5 +838,110 @@ out_kobj: kobject_put(uv_kobj); return rc; } -device_initcall(uv_info_init); -#endif +device_initcall(uv_sysfs_init); + +/* + * Locate a secret in the list by its id. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. + * + * Search for a secret with the given secret_id in the Ultravisor secret store. + * + * Context: might sleep. + */ +static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN], + const struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 i; + + for (i = 0; i < list->total_num_secrets; i++) { + if (memcmp(secret_id, list->secrets[i].id, UV_SECRET_ID_LEN) == 0) { + *secret = list->secrets[i].hdr; + return 0; + } + } + return -ENOENT; +} + +/** + * uv_find_secret() - search secret metadata for a given secret id. + * @secret_id: search pattern. + * @list: ephemeral buffer space + * @secret: output data, containing the secret's metadata. + * + * Context: might sleep. + */ +int uv_find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 start_idx = 0; + u16 list_rc; + int ret; + + do { + uv_list_secrets(list, start_idx, &list_rc, NULL); + if (list_rc != UVC_RC_EXECUTED && list_rc != UVC_RC_MORE_DATA) { + if (list_rc == UVC_RC_INV_CMD) + return -ENODEV; + else + return -EIO; + } + ret = find_secret_in_page(secret_id, list, secret); + if (ret == 0) + return ret; + start_idx = list->next_secret_idx; + } while (list_rc == UVC_RC_MORE_DATA && start_idx < list->next_secret_idx); + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(uv_find_secret); + +/** + * uv_retrieve_secret() - get the secret value for the secret index. + * @secret_idx: Secret index for which the secret should be retrieved. + * @buf: Buffer to store retrieved secret. + * @buf_size: Size of the buffer. The correct buffer size is reported as part of + * the result from `uv_get_secret_metadata`. + * + * Calls the Retrieve Secret UVC and translates the UV return code into an errno. + * + * Context: might sleep. + * + * Return: + * * %0 - Entry found; buffer contains a valid secret. + * * %ENOENT: - No entry found or secret at the index is non-retrievable. + * * %ENODEV: - Not supported: UV not available or command not available. + * * %EINVAL: - Buffer too small for content. + * * %EIO: - Other unexpected UV error. + */ +int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size) +{ + struct uv_cb_retr_secr uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_RETR_SECRET, + .secret_idx = secret_idx, + .buf_addr = (u64)buf, + .buf_size = buf_size, + }; + + uv_call_sched(0, (u64)&uvcb); + + switch (uvcb.header.rc) { + case UVC_RC_EXECUTED: + return 0; + case UVC_RC_INV_CMD: + return -ENODEV; + case UVC_RC_RETR_SECR_STORE_EMPTY: + case UVC_RC_RETR_SECR_INV_SECRET: + case UVC_RC_RETR_SECR_INV_IDX: + return -ENOENT; + case UVC_RC_RETR_SECR_BUF_SMALL: + return -EINVAL; + default: + return -EIO; + } +} +EXPORT_SYMBOL_GPL(uv_retrieve_secret); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 99694260cac9..a27a90a199be 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -7,129 +7,22 @@ */ #include <linux/binfmts.h> -#include <linux/compat.h> #include <linux/elf.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/time_namespace.h> +#include <linux/random.h> +#include <linux/vdso_datastore.h> #include <vdso/datapage.h> +#include <asm/vdso/vsyscall.h> +#include <asm/alternative.h> #include <asm/vdso.h> -extern char vdso64_start[], vdso64_end[]; -extern char vdso32_start[], vdso32_end[]; - -static struct vm_special_mapping vvar_mapping; - -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; - -struct vdso_data *vdso_data = vdso_data_store.data; - -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - -#ifdef CONFIG_TIME_NS -struct vdso_data *arch_get_vdso_data(void *vvar_page) -{ - return (struct vdso_data *)(vvar_page); -} - -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - WARN(1, "vvar_page accessed remotely"); - return NULL; -} - -/* - * The VVAR page layout depends on whether a task belongs to the root or - * non-root time namespace. Whenever a task changes its namespace, the VVAR - * page tables are cleared and then they will be re-faulted with a - * corresponding layout. - * See also the comment near timens_setup_vdso_data() for details. - */ -int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long size = vma->vm_end - vma->vm_start; - - if (!vma_is_special_mapping(vma, &vvar_mapping)) - continue; - zap_page_range(vma, vma->vm_start, size); - break; - } - mmap_read_unlock(mm); - return 0; -} -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} -#endif - -static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; - - switch (vmf->pgoff) { - case VVAR_DATA_PAGE_OFFSET: - pfn = virt_to_pfn(vdso_data); - if (timens_page) { - /* - * Fault in VVAR page too, since it will be accessed - * to get clock data anyway. - */ - addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); - if (unlikely(err & VM_FAULT_ERROR)) - return err; - pfn = page_to_pfn(timens_page); - } - break; -#ifdef CONFIG_TIME_NS - case VVAR_TIMENS_PAGE_OFFSET: - /* - * If a task belongs to a time namespace then a namespace - * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and - * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET - * offset. - * See also the comment near timens_setup_vdso_data(). - */ - if (!timens_page) - return VM_FAULT_SIGBUS; - pfn = virt_to_pfn(vdso_data); - break; -#endif /* CONFIG_TIME_NS */ - default: - return VM_FAULT_SIGBUS; - } - return vmf_insert_pfn(vma, vmf->address, pfn); -} +extern char vdso_start[], vdso_end[]; static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) @@ -138,17 +31,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .fault = vvar_fault, -}; - -static struct vm_special_mapping vdso64_mapping = { - .name = "[vdso]", - .mremap = vdso_mremap, -}; - -static struct vm_special_mapping vdso32_mapping = { +static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -160,44 +43,32 @@ int vdso_getcpu_init(void) } early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int map_vdso(unsigned long addr, unsigned long vdso_mapping_len) { - unsigned long vdso_text_len, vdso_mapping_len; - unsigned long vvar_start, vdso_text_start; - struct vm_special_mapping *vdso_mapping; + unsigned long vvar_start, vdso_text_start, vdso_text_len; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; - BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + BUILD_BUG_ON(VDSO_NR_PAGES != __VDSO_PAGES); if (mmap_write_lock_killable(mm)) return -EINTR; - if (is_compat_task()) { - vdso_text_len = vdso32_end - vdso32_start; - vdso_mapping = &vdso32_mapping; - } else { - vdso_text_len = vdso64_end - vdso64_start; - vdso_mapping = &vdso64_mapping; - } - vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; - vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + vdso_text_len = vdso_end - vdso_start; + vvar_start = get_unmapped_area(NULL, addr, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| - VM_PFNMAP, - &vvar_mapping); + vma = vdso_install_vvar_mapping(mm, vvar_start); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; + vdso_text_start = vvar_start + VDSO_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, - VM_READ|VM_EXEC| + VM_READ|VM_EXEC|VM_SEALED_SYSMAP| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso_mapping); + &vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -210,6 +81,51 @@ out: return rc; } +static unsigned long vdso_addr(unsigned long start, unsigned long len) +{ + unsigned long addr, end, offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= VDSO_BASE) + end = VDSO_BASE; + end -= len; + + if (end > start) { + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + return addr; +} + +unsigned long vdso_text_size(void) +{ + return PAGE_ALIGN(vdso_end - vdso_start); +} + +unsigned long vdso_size(void) +{ + return vdso_text_size() + VDSO_NR_PAGES * PAGE_SIZE; +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + unsigned long addr = VDSO_BASE; + unsigned long size = vdso_size(); + + if (current->flags & PF_RANDOMIZE) + addr = vdso_addr(current->mm->start_stack + PAGE_SIZE, size); + return map_vdso(addr, size); +} + static struct page ** __init vdso_setup_pages(void *start, void *end) { int pages = (end - start) >> PAGE_SHIFT; @@ -224,11 +140,26 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) return pagelist; } +static void vdso_apply_alternatives(void) +{ + const struct elf64_shdr *alt, *shdr; + struct alt_instr *start, *end; + const struct elf64_hdr *hdr; + + hdr = (struct elf64_hdr *)vdso_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + start = (void *)hdr + alt->sh_offset; + end = (void *)hdr + alt->sh_offset + alt->sh_size; + apply_alternatives(start, end); +} + static int __init vdso_init(void) { - vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); - if (IS_ENABLED(CONFIG_COMPAT)) - vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); + vdso_apply_alternatives(); + vdso_mapping.pages = vdso_setup_pages(vdso_start, vdso_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso/.gitignore index 5167384843b9..652e31d82582 100644 --- a/arch/s390/kernel/vdso32/.gitignore +++ b/arch/s390/kernel/vdso/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -vdso32.lds +vdso.lds diff --git a/arch/s390/kernel/vdso/Makefile b/arch/s390/kernel/vdso/Makefile new file mode 100644 index 000000000000..2fa12d4ac106 --- /dev/null +++ b/arch/s390/kernel/vdso/Makefile @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +# Include the generic Makefile to check the built vdso. +include $(srctree)/lib/vdso/Makefile.include +obj-vdso = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso = vdso_generic.o getcpu.o vgetrandom.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso_generic.o = $(VDSO_CFLAGS_REMOVE) + +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + +# Build rules + +targets := $(obj-vdso) $(obj-cvdso) vdso.so vdso.so.dbg +obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) +obj-cvdso := $(addprefix $(obj)/, $(obj-cvdso)) + +KBUILD_AFLAGS_VDSO := $(KBUILD_AFLAGS) -DBUILD_VDSO + +KBUILD_CFLAGS_VDSO := $(KBUILD_CFLAGS) -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_VDSO := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_VDSO)) +KBUILD_CFLAGS_VDSO += -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables +KBUILD_CFLAGS_VDSO += -fno-stack-protector +ldflags-y := -shared -soname=linux-vdso.so.1 \ + --hash-style=both --build-id=sha1 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_VDSO) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_VDSO) + +obj-y += vdso_wrapper.o +targets += vdso.lds +CPPFLAGS_vdso.lds += -P -C -U$(ARCH) + +# Force dependency (incbin is bad) +$(obj)/vdso_wrapper.o : $(obj)/vdso.so + +quiet_cmd_vdso_and_check = VDSO $@ + cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) + +# link rule for the .so file, .lds has to be first +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) $(obj-cvdso) FORCE + $(call if_changed,vdso_and_check) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# assembly rules for the .S files +$(obj-vdso): %.o: %.S FORCE + $(call if_changed_dep,vdsoas) + +$(obj-cvdso): %.o: %.c FORCE + $(call if_changed_dep,vdsocc) + +# actual build commands +quiet_cmd_vdsoas = VDSOA $@ + cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdsocc = VDSOC $@ + cmd_vdsocc = $(CC) $(c_flags) -c -o $@ $< + +# Generate VDSO offsets using helper script +gen-vdsosym := $(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso/gen_vdso_offsets.sh index 37f05cb38dad..359982fb002d 100755 --- a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh +++ b/arch/s390/kernel/vdso/gen_vdso_offsets.sh @@ -12,4 +12,4 @@ # LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso/getcpu.c index 5c5d4a848b76..5c5d4a848b76 100644 --- a/arch/s390/kernel/vdso64/getcpu.c +++ b/arch/s390/kernel/vdso/getcpu.c diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso/note.S index db19d0680a0a..db19d0680a0a 100644 --- a/arch/s390/kernel/vdso32/note.S +++ b/arch/s390/kernel/vdso/note.S diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso/vdso.h index 34c7a2312f9d..8cff033dd854 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso/vdso.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARCH_S390_KERNEL_VDSO64_VDSO_H -#define __ARCH_S390_KERNEL_VDSO64_VDSO_H +#ifndef __ARCH_S390_KERNEL_VDSO_VDSO_H +#define __ARCH_S390_KERNEL_VDSO_VDSO_H #include <vdso/datapage.h> @@ -10,5 +10,6 @@ int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unuse int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); -#endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ +#endif /* __ARCH_S390_KERNEL_VDSO_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso/vdso.lds.S index d4fb336d747b..7bec4de0e8e0 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso/vdso.lds.S @@ -4,20 +4,20 @@ * library */ +#include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> +#include <asm-generic/vmlinux.lds.h> +#include <vdso/datapage.h> OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") OUTPUT_ARCH(s390:64-bit) -ENTRY(_start) SECTIONS { - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; + VDSO_VVAR_SYMS + + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -43,6 +43,10 @@ SECTIONS .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } + . = ALIGN(8); + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + .dynamic : { *(.dynamic) } :text :dynamic .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr @@ -51,51 +55,14 @@ SECTIONS .rela.dyn ALIGN(8) : { *(.rela.dyn) } .got ALIGN(8) : { *(.got .toc) } + .got.plt ALIGN(8) : { *(.got.plt) } _end = .; PROVIDE(end = .); - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } + STABS_DEBUG + DWARF_DEBUG .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { @@ -140,6 +107,7 @@ VERSION __kernel_restart_syscall; __kernel_rt_sigreturn; __kernel_sigreturn; + __kernel_getrandom; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso64_generic.c b/arch/s390/kernel/vdso/vdso_generic.c index a9aa75643c08..a9aa75643c08 100644 --- a/arch/s390/kernel/vdso64/vdso64_generic.c +++ b/arch/s390/kernel/vdso/vdso_generic.c diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso/vdso_user_wrapper.S index 97f0c0a669a5..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_user_wrapper.S @@ -1,12 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> #include <asm/vdso.h> #include <asm/unistd.h> #include <asm/asm-offsets.h> #include <asm/dwarf.h> #include <asm/ptrace.h> -#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) - /* * Older glibc version called vdso without allocating a stackframe. This wrapper * is just used to allocate a stackframe. See @@ -14,23 +13,23 @@ * for details. */ .macro vdso_func func - .globl __kernel_\func - .type __kernel_\func,@function - .align 8 -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC - aghi %r15,-WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - stg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,-STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD) + CFI_VAL_OFFSET 15,-STACK_FRAME_USER_OVERHEAD + stg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_REL_OFFSET 14,__SFVDSO_RETURN_ADDRESS + xc __SFUSER_BACKCHAIN(8,%r15),__SFUSER_BACKCHAIN(%r15) brasl %r14,__s390_vdso_\func - lg %r14,STACK_FRAME_OVERHEAD(%r15) - aghi %r15,WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + lg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_RESTORE 14 + aghi %r15,STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET STACK_FRAME_USER_OVERHEAD CFI_RESTORE 15 br %r14 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_func gettimeofday @@ -39,16 +38,13 @@ vdso_func clock_gettime vdso_func getcpu .macro vdso_syscall func,syscall - .globl __kernel_\func - .type __kernel_\func,@function - .align 8 -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC svc \syscall /* Make sure we notice when a syscall returns, which shouldn't happen */ .word 0 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_syscall restart_syscall,__NR_restart_syscall diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso/vdso_wrapper.S index de2fb930471a..f69e62a14978 100644 --- a/arch/s390/kernel/vdso32/vdso32_wrapper.S +++ b/arch/s390/kernel/vdso/vdso_wrapper.S @@ -5,11 +5,11 @@ __PAGE_ALIGNED_DATA - .globl vdso32_start, vdso32_end + .globl vdso_start, vdso_end .balign PAGE_SIZE -vdso32_start: - .incbin "arch/s390/kernel/vdso32/vdso32.so" +vdso_start: + .incbin "arch/s390/kernel/vdso/vdso.so" .balign PAGE_SIZE -vdso32_end: +vdso_end: .previous diff --git a/arch/s390/kernel/vdso/vgetrandom-chacha.S b/arch/s390/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..09c034c2f853 --- /dev/null +++ b/arch/s390/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/stringify.h> +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/dwarf.h> +#include <asm/fpu-insn.h> + +#define STATE0 %v0 +#define STATE1 %v1 +#define STATE2 %v2 +#define STATE3 %v3 +#define COPY0 %v4 +#define COPY1 %v5 +#define COPY2 %v6 +#define COPY3 %v7 +#define BEPERM %v19 +#define TMP0 %v20 +#define TMP1 %v21 +#define TMP2 %v22 +#define TMP3 %v23 + + .section .rodata + + .balign 32 +SYM_DATA_START_LOCAL(chacha20_constants) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_END(chacha20_constants) + + .text +/* + * s390 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Does not spill to the stack. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + CFI_STARTPROC + larl %r1,chacha20_constants + + /* COPY0 = "expand 32-byte k" */ + VL COPY0,0,,%r1 + + /* BEPERM = byte selectors for VPERM */ + ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148) + + /* COPY1,COPY2 = key */ + VLM COPY1,COPY2,0,%r3 + + /* COPY3 = counter || zero nonce */ + lg %r3,0(%r4) + VZERO COPY3 + VLVGG COPY3,%r3,0 + + lghi %r1,0 +.Lblock: + VLR STATE0,COPY0 + VLR STATE1,COPY1 + VLR STATE2,COPY2 + VLR STATE3,COPY3 + + lghi %r0,10 +.Ldoubleround: + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ + VSLDB STATE1,STATE1,STATE1,4 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ + VSLDB STATE3,STATE3,STATE3,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ + VSLDB STATE1,STATE1,STATE1,12 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ + VSLDB STATE3,STATE3,STATE3,4 + brctg %r0,.Ldoubleround + + /* OUTPUT0 = STATE0 + COPY0 */ + VAF STATE0,STATE0,COPY0 + /* OUTPUT1 = STATE1 + COPY1 */ + VAF STATE1,STATE1,COPY1 + /* OUTPUT2 = STATE2 + COPY2 */ + VAF STATE2,STATE2,COPY2 + /* OUTPUT3 = STATE3 + COPY3 */ + VAF STATE3,STATE3,COPY3 + + ALTERNATIVE \ + __stringify( \ + /* Convert STATE to little endian and store to OUTPUT */\ + VPERM TMP0,STATE0,STATE0,BEPERM; \ + VPERM TMP1,STATE1,STATE1,BEPERM; \ + VPERM TMP2,STATE2,STATE2,BEPERM; \ + VPERM TMP3,STATE3,STATE3,BEPERM; \ + VSTM TMP0,TMP3,0,%r2), \ + __stringify( \ + /* 32 bit wise little endian store to OUTPUT */ \ + VSTBRF STATE0,0,,%r2; \ + VSTBRF STATE1,16,,%r2; \ + VSTBRF STATE2,32,,%r2; \ + VSTBRF STATE3,48,,%r2; \ + brcl 0,0), \ + ALT_FACILITY(148) + + /* ++COPY3.COUNTER */ + /* alsih %r3,1 */ + .insn rilu,0xcc0a00000000,%r3,1 + alcr %r3,%r1 + VLVGG COPY3,%r3,0 + + /* OUTPUT += 64, --NBLOCKS */ + aghi %r2,64 + brctg %r5,.Lblock + + /* COUNTER = COPY3.COUNTER */ + stg %r3,0(%r4) + + /* Zero out potentially sensitive regs */ + VZERO STATE0 + VZERO STATE1 + VZERO STATE2 + VZERO STATE3 + VZERO COPY1 + VZERO COPY2 + + /* Early exit if TMP0-TMP3 have not been used */ + ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148) + + VZERO TMP0 + VZERO TMP1 + VZERO TMP2 + VZERO TMP3 + + br %r14 + CFI_ENDPROC +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/s390/kernel/vdso/vgetrandom.c b/arch/s390/kernel/vdso/vgetrandom.c new file mode 100644 index 000000000000..b5268b507fb5 --- /dev/null +++ b/arch/s390/kernel/vdso/vgetrandom.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/facility.h> +#include <uapi/asm-generic/errno.h> +#include "vdso.h" + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (test_facility(129)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile deleted file mode 100644 index b2349a3f4fa3..000000000000 --- a/arch/s390/kernel/vdso32/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT - -include $(srctree)/lib/vdso/Makefile -obj-vdso32 = vdso_user_wrapper-32.o note-32.o - -# Build rules - -targets := $(obj-vdso32) vdso32.so vdso32.so.dbg -obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_32 += -m31 -s - -KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin - -LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \ - --hash-style=both --build-id=sha1 -melf_s390 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) - -obj-y += vdso32_wrapper.o -CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) - -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n - -# Force dependency (incbin is bad) -$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so - -$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE - $(call if_changed,ld) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -$(obj-vdso32): %-32.o: %.S FORCE - $(call if_changed_dep,vdso32as) - -# actual build commands -quiet_cmd_vdso32as = VDSO32A $@ - cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso32cc = VDSO32C $@ - cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< - -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso32.so: $(obj)/vdso32.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso32.so - -# Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh deleted file mode 100755 index 9c4f951e227d..000000000000 --- a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# -# Match symbols in the DSO that look like VDSO_*; produce a header file -# of constant offsets into the shared object. -# -# Doing this inside the Makefile will break the $(filter-out) function, -# causing Kbuild to rebuild the vdso-offsets header file every time. -# -# Inspired by arm64 version. -# - -LC_ALL=C -sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S deleted file mode 100644 index bff50b6acd6d..000000000000 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ /dev/null @@ -1,141 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the infamous ld script for the 64 bits vdso - * library - */ - -#include <asm/page.h> -#include <asm/vdso.h> - -OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") -OUTPUT_ARCH(s390:31-bit) -ENTRY(_start) - -SECTIONS -{ - PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); -#ifdef CONFIG_TIME_NS - PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); -#endif - . = VDSO_LBASE + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - .note : { *(.note.*) } :text :note - - . = ALIGN(16); - .text : { - *(.text .stub .text.* .gnu.linkonce.t.*) - } :text - PROVIDE(__etext = .); - PROVIDE(_etext = .); - PROVIDE(etext = .); - - /* - * Other stuff is appended to the text segment: - */ - .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } - .rodata1 : { *(.rodata1) } - - .dynamic : { *(.dynamic) } :text :dynamic - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } - - .rela.dyn ALIGN(8) : { *(.rela.dyn) } - .got ALIGN(8) : { *(.got .toc) } - - _end = .; - PROVIDE(end = .); - - /* - * Stabs debugging sections are here too. - */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } - - /* - * DWARF debug sections. - * Symbols in the DWARF debugging sections are relative to the - * beginning of the section so we begin them at 0. - */ - /* DWARF 1 */ - .debug 0 : { *(.debug) } - .line 0 : { *(.line) } - /* GNU DWARF 1 extensions */ - .debug_srcinfo 0 : { *(.debug_srcinfo) } - .debug_sfnames 0 : { *(.debug_sfnames) } - /* DWARF 1.1 and DWARF 2 */ - .debug_aranges 0 : { *(.debug_aranges) } - .debug_pubnames 0 : { *(.debug_pubnames) } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - /* DWARF 3 */ - .debug_pubtypes 0 : { *(.debug_pubtypes) } - .debug_ranges 0 : { *(.debug_ranges) } - .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - - /DISCARD/ : { - *(.note.GNU-stack) - *(.branch_lt) - *(.data .data.* .gnu.linkonce.d.* .sdata*) - *(.bss .sbss .dynbss .dynsbss) - } -} - -/* - * Very old versions of ld do not recognize this name token; use the constant. - */ -#define PT_GNU_EH_FRAME 0x6474e550 - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - VDSO_VERSION_STRING { - global: - /* - * Has to be there for the kernel to find - */ - __kernel_compat_restart_syscall; - __kernel_compat_rt_sigreturn; - __kernel_compat_sigreturn; - local: *; - }; -} diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S deleted file mode 100644 index 3f42f27f978c..000000000000 --- a/arch/s390/kernel/vdso32/vdso_user_wrapper.S +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <asm/unistd.h> -#include <asm/dwarf.h> - -.macro vdso_syscall func,syscall - .globl __kernel_compat_\func - .type __kernel_compat_\func,@function - .align 8 -__kernel_compat_\func: - CFI_STARTPROC - svc \syscall - /* Make sure we notice when a syscall returns, which shouldn't happen */ - .word 0 - CFI_ENDPROC - .size __kernel_compat_\func,.-__kernel_compat_\func -.endm - -vdso_syscall restart_syscall,__NR_restart_syscall -vdso_syscall sigreturn,__NR_sigreturn -vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/.gitignore b/arch/s390/kernel/vdso64/.gitignore deleted file mode 100644 index 4ec80685fecc..000000000000 --- a/arch/s390/kernel/vdso64/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -vdso64.lds diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile deleted file mode 100644 index 2a2092ce19f1..000000000000 --- a/arch/s390/kernel/vdso64/Makefile +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# List of files in the vdso - -KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT - -include $(srctree)/lib/vdso/Makefile -obj-vdso64 = vdso_user_wrapper.o note.o -obj-cvdso64 = vdso64_generic.o getcpu.o -CFLAGS_REMOVE_getcpu.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) -CFLAGS_REMOVE_vdso64_generic.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) - -# Build rules - -targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg -obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) -obj-cvdso64 := $(addprefix $(obj)/, $(obj-cvdso64)) - -KBUILD_AFLAGS += -DBUILD_VDSO -KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING - -KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) -KBUILD_AFLAGS_64 += -m64 -s - -KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin -ldflags-y := -fPIC -shared -nostdlib -soname=linux-vdso64.so.1 \ - --hash-style=both --build-id=sha1 -T - -$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) -$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) - -obj-y += vdso64_wrapper.o -targets += vdso64.lds -CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) - -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n - -# Force dependency (incbin is bad) -$(obj)/vdso64_wrapper.o : $(obj)/vdso64.so - -# link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE - $(call if_changed,ld) - -# strip rule for the .so file -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -# assembly rules for the .S files -$(obj-vdso64): %.o: %.S FORCE - $(call if_changed_dep,vdso64as) - -$(obj-cvdso64): %.o: %.c FORCE - $(call if_changed_dep,vdso64cc) - -# actual build commands -quiet_cmd_vdso64as = VDSO64A $@ - cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< -quiet_cmd_vdso64cc = VDSO64C $@ - cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< - -# install commands for the unstripped file -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ - -vdso64.so: $(obj)/vdso64.so.dbg - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: vdso64.so - -# Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh -quiet_cmd_vdsosym = VDSOSYM $@ - cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ - -include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE - $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/note.S b/arch/s390/kernel/vdso64/note.S deleted file mode 100644 index db19d0680a0a..000000000000 --- a/arch/s390/kernel/vdso64/note.S +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include <linux/uts.h> -#include <linux/version.h> -#include <linux/elfnote.h> - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S deleted file mode 100644 index 672184998623..000000000000 --- a/arch/s390/kernel/vdso64/vdso64_wrapper.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/init.h> -#include <linux/linkage.h> -#include <asm/page.h> - - __PAGE_ALIGNED_DATA - - .globl vdso64_start, vdso64_end - .balign PAGE_SIZE -vdso64_start: - .incbin "arch/s390/kernel/vdso64/vdso64.so" - .balign PAGE_SIZE -vdso64_end: - - .previous diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c new file mode 100644 index 000000000000..cc8933e04ff7 --- /dev/null +++ b/arch/s390/kernel/vmcore_info.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/vmcore_info.h> +#include <linux/mm.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> +#include <asm/setup.h> + +void arch_crash_save_vmcoreinfo(void) +{ + struct lowcore *abs_lc; + + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); + vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys); + abs_lc = get_abs_lowcore(); + abs_lc->vmcore_info = paddr_vmcoreinfo_note(); + put_abs_lowcore(abs_lc); +} diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 4c0e19145cc6..53bcbb91bb9b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -5,6 +5,7 @@ #include <asm/thread_info.h> #include <asm/page.h> +#include <asm/ftrace.lds.h> /* * Put .bss..swapper_pg_dir as the first thing in .bss. This will @@ -13,9 +14,13 @@ #define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir) \ *(.bss..invalid_pg_dir) +#define RO_EXCEPTION_TABLE_ALIGN 16 + /* Handle ro_after_init data on our own. */ #define RO_AFTER_INIT_DATA +#define RUNTIME_DISCARD_EXIT + #define EMITS_PT_NOTE #include <asm-generic/vmlinux.lds.h> @@ -34,20 +39,19 @@ PHDRS { SECTIONS { - . = 0x100000; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ HEAD_TEXT TEXT_TEXT SCHED_TEXT - CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT - *(.text.*_indirect_*) - *(.fixup) + FTRACE_HOTPATCH_TRAMPOLINES_TEXT + *(.text..*_indirect_*) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ @@ -64,13 +68,39 @@ SECTIONS *(.data..ro_after_init) JUMP_TABLE_DATA } :data - EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; + . = ALIGN(8); + .skey_region_table : { + __skey_region_start = .; + KEEP(*(.skey_region)) + __skey_region_end = .; + } + + .data.rel.ro : { + *(.data.rel.ro .data.rel.ro.*) + } + .got : { + __got_start = .; + *(.got) + __got_end = .; + } + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) + .data.rel : { + *(.data.rel*) + } BOOT_DATA_PRESERVED + . = ALIGN(8); + .amode31.refs : { + _start_amode31_refs = .; + *(.amode31.refs) + _end_amode31_refs = .; + } + + . = ALIGN(PAGE_SIZE); _edata = .; /* End of data section */ /* will be freed after init */ @@ -120,9 +150,19 @@ SECTIONS *(.altinstr_replacement) } +#ifdef CONFIG_STACKPROTECTOR + . = ALIGN(8); + .stack_prot_table : { + __stack_prot_start = .; + KEEP(*(__stack_protector_loc)) + __stack_prot_end = .; + } +#endif + /* * Table with the patch locations to undo expolines */ + . = ALIGN(4); .nospec_call_table : { __nospec_call_start = . ; *(.s390_indirect*) @@ -136,36 +176,80 @@ SECTIONS BOOT_DATA + /* + * .amode31 section for code, data, ex_table that need to stay + * below 2 GB, even when the kernel is relocated above 2 GB. + */ + . = ALIGN(PAGE_SIZE); + _samode31 = .; + .amode31.text : { + _stext_amode31 = .; + *(.amode31.text) + *(.amode31.text.*_indirect_*) + . = ALIGN(PAGE_SIZE); + _etext_amode31 = .; + } + . = ALIGN(16); + .amode31.ex_table : { + _start_amode31_ex_table = .; + KEEP(*(.amode31.ex_table)) + _stop_amode31_ex_table = .; + } + . = ALIGN(PAGE_SIZE); + .amode31.data : { + *(.amode31.data) + } + . = _samode31 + AMODE31_SIZE; + _eamode31 = .; + /* early.c uses stsi, which requires page aligned data. */ . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - PERCPU_SECTION(0x100) + RUNTIME_CONST_VARIABLES - .dynsym ALIGN(8) : { - __dynsym_start = .; - *(.dynsym) - __dynsym_end = .; - } - .rela.dyn ALIGN(8) : { - __rela_dyn_start = .; - *(.rela*) - __rela_dyn_end = .; - } + PERCPU_SECTION(0x100) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE) + . = ALIGN(PAGE_SIZE); _end = . ; + /* Debugging sections. */ + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the three reserved double words. + */ + .got.plt : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .plt : { + *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") + /* * uncompressed image info used by the decompressor * it should match struct vmlinux_info */ .vmlinux.info 0 (INFO) : { - QUAD(_stext) /* default_lma */ QUAD(startup_continue) /* entry */ QUAD(__bss_start - _stext) /* image_size */ QUAD(__bss_stop - __bss_start) /* bss_size */ @@ -174,19 +258,31 @@ SECTIONS QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ QUAD(__boot_data_preserved_end - __boot_data_preserved_start) /* bootdata_preserved_size */ - QUAD(__dynsym_start) /* dynsym_start */ - QUAD(__rela_dyn_start) /* rela_dyn_start */ - QUAD(__rela_dyn_end) /* rela_dyn_end */ + QUAD(__got_start) /* got_start */ + QUAD(__got_end) /* got_end */ + QUAD(_eamode31 - _samode31) /* amode31_size */ + QUAD(init_mm) + QUAD(swapper_pg_dir) + QUAD(invalid_pg_dir) + QUAD(__alt_instructions) + QUAD(__alt_instructions_end) +#ifdef CONFIG_STACKPROTECTOR + QUAD(__stack_prot_start) + QUAD(__stack_prot_end) +#endif +#ifdef CONFIG_KASAN + QUAD(kasan_early_shadow_page) + QUAD(kasan_early_shadow_pte) + QUAD(kasan_early_shadow_pmd) + QUAD(kasan_early_shadow_pud) + QUAD(kasan_early_shadow_p4d) +#endif } :NONE - /* Debugging sections. */ - STABS_DEBUG - DWARF_DEBUG - ELF_DETAILS - /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) + *(.interp) } } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index f216a1b2f825..234a0ba30510 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -7,13 +7,13 @@ */ #include <linux/kernel_stat.h> -#include <linux/sched/cputime.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/timex.h> #include <linux/types.h> #include <linux/time.h> #include <asm/alternative.h> +#include <asm/cputime.h> #include <asm/vtimer.h> #include <asm/vtime.h> #include <asm/cpu_mf.h> @@ -33,24 +33,17 @@ static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_jiffies); -static inline u64 get_vtimer(void) -{ - u64 timer; - - asm volatile("stpt %0" : "=Q" (timer)); - return timer; -} - static inline void set_vtimer(u64 expires) { + struct lowcore *lc = get_lowcore(); u64 timer; asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. afterwards */ : "=Q" (timer) : "Q" (expires)); - S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; - S390_lowcore.last_update_timer = expires; + lc->system_timer += lc->last_update_timer - timer; + lc->last_update_timer = expires; } static inline int virt_timer_forward(u64 elapsed) @@ -125,23 +118,23 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime, static int do_account_vtime(struct task_struct *tsk) { u64 timer, clock, user, guest, system, hardirq, softirq; + struct lowcore *lc = get_lowcore(); - timer = S390_lowcore.last_update_timer; - clock = S390_lowcore.last_update_clock; - /* Use STORE CLOCK by default, STORE CLOCK FAST if available. */ - alternative_io("stpt %0\n .insn s,0xb2050000,%1\n", - "stpt %0\n .insn s,0xb27c0000,%1\n", - 25, - ASM_OUTPUT2("=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock)), - ASM_NO_INPUT_CLOBBER("cc")); - clock = S390_lowcore.last_update_clock - clock; - timer -= S390_lowcore.last_update_timer; + timer = lc->last_update_timer; + clock = lc->last_update_clock; + asm volatile( + " stpt %0\n" /* Store current cpu timer value */ + " stckf %1" /* Store current tod clock value */ + : "=Q" (lc->last_update_timer), + "=Q" (lc->last_update_clock) + : : "cc"); + clock = lc->last_update_clock - clock; + timer -= lc->last_update_timer; if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; + lc->hardirq_timer += timer; else - S390_lowcore.system_timer += timer; + lc->system_timer += timer; /* Update MT utilization calculation */ if (smp_cpu_mtid && @@ -150,16 +143,16 @@ static int do_account_vtime(struct task_struct *tsk) /* Calculate cputime delta */ user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(S390_lowcore.user_timer)); + READ_ONCE(lc->user_timer)); guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(S390_lowcore.guest_timer)); + READ_ONCE(lc->guest_timer)); system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(S390_lowcore.system_timer)); + READ_ONCE(lc->system_timer)); hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(S390_lowcore.hardirq_timer)); + READ_ONCE(lc->hardirq_timer)); softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(S390_lowcore.softirq_timer)); - S390_lowcore.steal_timer += + READ_ONCE(lc->softirq_timer)); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ @@ -185,17 +178,19 @@ static int do_account_vtime(struct task_struct *tsk) void vtime_task_switch(struct task_struct *prev) { + struct lowcore *lc = get_lowcore(); + do_account_vtime(prev); - prev->thread.user_timer = S390_lowcore.user_timer; - prev->thread.guest_timer = S390_lowcore.guest_timer; - prev->thread.system_timer = S390_lowcore.system_timer; - prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; - prev->thread.softirq_timer = S390_lowcore.softirq_timer; - S390_lowcore.user_timer = current->thread.user_timer; - S390_lowcore.guest_timer = current->thread.guest_timer; - S390_lowcore.system_timer = current->thread.system_timer; - S390_lowcore.hardirq_timer = current->thread.hardirq_timer; - S390_lowcore.softirq_timer = current->thread.softirq_timer; + prev->thread.user_timer = lc->user_timer; + prev->thread.guest_timer = lc->guest_timer; + prev->thread.system_timer = lc->system_timer; + prev->thread.hardirq_timer = lc->hardirq_timer; + prev->thread.softirq_timer = lc->softirq_timer; + lc->user_timer = current->thread.user_timer; + lc->guest_timer = current->thread.guest_timer; + lc->system_timer = current->thread.system_timer; + lc->hardirq_timer = current->thread.hardirq_timer; + lc->softirq_timer = current->thread.softirq_timer; } /* @@ -205,28 +200,29 @@ void vtime_task_switch(struct task_struct *prev) */ void vtime_flush(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 steal, avg_steal; if (do_account_vtime(tsk)) virt_timer_expire(); - steal = S390_lowcore.steal_timer; - avg_steal = S390_lowcore.avg_steal_timer / 2; + steal = lc->steal_timer; + avg_steal = lc->avg_steal_timer; if ((s64) steal > 0) { - S390_lowcore.steal_timer = 0; + lc->steal_timer = 0; account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } - S390_lowcore.avg_steal_timer = avg_steal; + lc->avg_steal_timer = avg_steal / 2; } static u64 vtime_delta(void) { - u64 timer = S390_lowcore.last_update_timer; - - S390_lowcore.last_update_timer = get_vtimer(); + struct lowcore *lc = get_lowcore(); + u64 timer = lc->last_update_timer; - return timer - S390_lowcore.last_update_timer; + lc->last_update_timer = get_cpu_timer(); + return timer - lc->last_update_timer; } /* @@ -235,12 +231,13 @@ static u64 vtime_delta(void) */ void vtime_account_kernel(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 delta = vtime_delta(); if (tsk->flags & PF_VCPU) - S390_lowcore.guest_timer += delta; + lc->guest_timer += delta; else - S390_lowcore.system_timer += delta; + lc->system_timer += delta; virt_timer_forward(delta); } @@ -250,7 +247,7 @@ void vtime_account_softirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.softirq_timer += delta; + get_lowcore()->softirq_timer += delta; virt_timer_forward(delta); } @@ -259,7 +256,7 @@ void vtime_account_hardirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.hardirq_timer += delta; + get_lowcore()->hardirq_timer += delta; virt_timer_forward(delta); } diff --git a/arch/s390/kernel/wti.c b/arch/s390/kernel/wti.c new file mode 100644 index 000000000000..949fdbf0e8b6 --- /dev/null +++ b/arch/s390/kernel/wti.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for warning track interruption + * + * Copyright IBM Corp. 2023 + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/smpboot.h> +#include <linux/irq.h> +#include <uapi/linux/sched/types.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/sclp.h> + +#define WTI_DBF_LEN 64 + +struct wti_debug { + unsigned long missed; + unsigned long addr; + pid_t pid; +}; + +struct wti_state { + /* debug data for s390dbf */ + struct wti_debug dbg; + /* + * Represents the real-time thread responsible to + * acknowledge the warning-track interrupt and trigger + * preliminary and postliminary precautions. + */ + struct task_struct *thread; + /* + * If pending is true, the real-time thread must be scheduled. + * If not, a wake up of that thread will remain a noop. + */ + bool pending; +}; + +static DEFINE_PER_CPU(struct wti_state, wti_state); + +static debug_info_t *wti_dbg; + +/* + * During a warning-track grace period, interrupts are disabled + * to prevent delays of the warning-track acknowledgment. + * + * Once the CPU is physically dispatched again, interrupts are + * re-enabled. + */ + +static void wti_irq_disable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* disable all I/O interrupts */ + cr6.val &= ~0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void wti_irq_enable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* enable all I/O interrupts */ + cr6.val |= 0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void store_debug_data(struct wti_state *st) +{ + struct pt_regs *regs = get_irq_regs(); + + st->dbg.pid = current->pid; + st->dbg.addr = 0; + if (!user_mode(regs)) + st->dbg.addr = regs->psw.addr; +} + +static void wti_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct wti_state *st = this_cpu_ptr(&wti_state); + + inc_irq_stat(IRQEXT_WTI); + wti_irq_disable(); + store_debug_data(st); + st->pending = true; + wake_up_process(st->thread); +} + +static int wti_pending(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + return st->pending; +} + +static void wti_dbf_grace_period(struct wti_state *st) +{ + struct wti_debug *wdi = &st->dbg; + char buf[WTI_DBF_LEN]; + + if (wdi->addr) + snprintf(buf, sizeof(buf), "%d %pS", wdi->pid, (void *)wdi->addr); + else + snprintf(buf, sizeof(buf), "%d <user>", wdi->pid); + debug_text_event(wti_dbg, 2, buf); + wdi->missed++; +} + +static int wti_show(struct seq_file *seq, void *v) +{ + struct wti_state *st; + int cpu; + + cpus_read_lock(); + seq_puts(seq, " "); + for_each_online_cpu(cpu) + seq_printf(seq, "CPU%-8d", cpu); + seq_putc(seq, '\n'); + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + seq_printf(seq, " %10lu", st->dbg.missed); + } + seq_putc(seq, '\n'); + cpus_read_unlock(); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(wti); + +static void wti_thread_fn(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + st->pending = false; + /* + * Yield CPU voluntarily to the hypervisor. Control + * resumes when hypervisor decides to dispatch CPU + * to this LPAR again. + */ + if (diag49c(DIAG49C_SUBC_ACK)) + wti_dbf_grace_period(st); + wti_irq_enable(); +} + +static struct smp_hotplug_thread wti_threads = { + .store = &wti_state.thread, + .thread_should_run = wti_pending, + .thread_fn = wti_thread_fn, + .thread_comm = "cpuwti/%u", + .selfparking = false, +}; + +static int __init wti_init(void) +{ + struct sched_param wti_sched_param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct dentry *wti_dir; + struct wti_state *st; + int cpu, rc; + + rc = -EOPNOTSUPP; + if (!sclp.has_wti) + goto out; + rc = smpboot_register_percpu_thread(&wti_threads); + if (WARN_ON(rc)) + goto out; + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + sched_setscheduler(st->thread, SCHED_FIFO, &wti_sched_param); + } + rc = register_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); + if (rc) { + pr_warn("Couldn't request external interrupt 0x1007\n"); + goto out_thread; + } + irq_subclass_register(IRQ_SUBCLASS_WARNING_TRACK); + rc = diag49c(DIAG49C_SUBC_REG); + if (rc) { + pr_warn("Failed to register warning track interrupt through DIAG 49C\n"); + rc = -EOPNOTSUPP; + goto out_subclass; + } + wti_dir = debugfs_create_dir("wti", arch_debugfs_dir); + debugfs_create_file("stat", 0400, wti_dir, NULL, &wti_fops); + wti_dbg = debug_register("wti", 1, 1, WTI_DBF_LEN); + if (!wti_dbg) { + rc = -ENOMEM; + goto out_debug_register; + } + rc = debug_register_view(wti_dbg, &debug_hex_ascii_view); + if (rc) + goto out_debug_register; + goto out; +out_debug_register: + debug_unregister(wti_dbg); +out_subclass: + irq_subclass_unregister(IRQ_SUBCLASS_WARNING_TRACK); + unregister_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); +out_thread: + smpboot_unregister_percpu_thread(&wti_threads); +out: + return rc; +} +late_initcall(wti_init); |
