Diffstat (limited to 'arch/s390/kernel')
80 files changed, 4127 insertions, 2320 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index fa029d0dc28f..db5f3a3faefb 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -11,6 +11,8 @@
 CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
 # Do not trace early setup code
 CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE)
 
 endif
 
@@ -34,22 +36,24 @@
 CFLAGS_stacktrace.o += -fno-optimize-sibling-calls
 CFLAGS_dumpstack.o += -fno-optimize-sibling-calls
 CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
 
-obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o
+obj-y := head64.o traps.o time.o process.o early.o setup.o idle.o vtime.o
 obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o
+obj-y += debug.o irq.o ipl.o dis.o vdso.o cpufeature.o
 obj-y += sysinfo.o lgr.o os_info.o ctlreg.o
 obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y += entry.o reipl.o kdebugfs.o alternative.o
 obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o
+obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o
+obj-y += diag/
 
 extra-y += vmlinux.lds
 
 obj-$(CONFIG_SYSFS) += nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE)
+obj-$(CONFIG_SYSFS) += cpacf.o
 
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o
+obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o
 obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_AUDIT) += audit.o
 compat-obj-$(CONFIG_AUDIT) += compat_audit.o
@@ -57,7 +61,6 @@
 obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o
 obj-$(CONFIG_COMPAT) += $(compat-obj-y)
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KPROBES) += kprobes_insn_page.o
 obj-$(CONFIG_KPROBES) += mcount.o
 obj-$(CONFIG_RETHOOK) += rethook.o
 obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o
@@ -79,7 +82,6 @@
 obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_pai_crypto.o perf_pai_ext.o
 obj-$(CONFIG_TRACEPOINTS) += trace.o
-obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
 
 # vdso
 obj-y += vdso64/
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
index f9efc54ec4b7..88f0b91d7a73 100644
--- a/arch/s390/kernel/abs_lowcore.c
+++ b/arch/s390/kernel/abs_lowcore.c
@@ -2,8 +2,10 @@
 
 #include <linux/pgtable.h>
 #include <asm/abs_lowcore.h>
+#include <asm/sections.h>
 
 unsigned long __bootdata_preserved(__abs_lowcore);
+int __bootdata_preserved(relocate_lowcore);
 
 int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc)
 {
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index e7bca29f9c34..8d5d0de35de0 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,75 +1,41 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <asm/text-patching.h>
+
+#include <linux/uaccess.h>
+#include <asm/nospec-branch.h>
+#include <asm/abs_lowcore.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
-#include <asm/nospec-branch.h>
 
-static int __initdata_or_module alt_instr_disabled;
-
-static int __init disable_alternative_instructions(char *str)
+void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx)
 {
-	alt_instr_disabled = 1;
-	return 0;
-}
-
-early_param("noaltinstr", disable_alternative_instructions);
-
-static void __init_or_module __apply_alternatives(struct alt_instr *start,
-						  struct alt_instr *end)
-{
-	struct alt_instr *a;
 	u8 *instr, *replacement;
+	struct alt_instr *a;
+	bool replace;
 
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 */
 	for (a = start; a < end; a++) {
-		instr = (u8 *)&a->instr_offset + a->instr_offset;
-		replacement = (u8 *)&a->repl_offset + a->repl_offset;
-
-		if (!__test_facility(a->facility, alt_stfle_fac_list))
-			continue;
-
-		if (unlikely(a->instrlen % 2)) {
-			WARN_ONCE(1, "cpu alternatives instructions length is "
-				"odd, skipping patching\n");
+		if (!(a->ctx & ctx))
 			continue;
+		switch (a->type) {
+		case ALT_TYPE_FACILITY:
+			replace = test_facility(a->data);
+			break;
+		case ALT_TYPE_SPEC:
+			replace = nobp_enabled();
+			break;
+		case ALT_TYPE_LOWCORE:
+			replace = have_relocated_lowcore();
+			break;
+		default:
+			replace = false;
 		}
-
+		if (!replace)
+			continue;
+		instr = (u8 *)&a->instr_offset + a->instr_offset;
+		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		s390_kernel_write(instr, replacement, a->instrlen);
 	}
 }
-
-void __init_or_module apply_alternatives(struct alt_instr *start,
-					 struct alt_instr *end)
-{
-	if (!alt_instr_disabled)
-		__apply_alternatives(start, end);
-}
-
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-void __init apply_alternative_instructions(void)
-{
-	apply_alternatives(__alt_instructions, __alt_instructions_end);
-}
-
-static void do_sync_core(void *info)
-{
-	sync_core();
-}
-
-void text_poke_sync(void)
-{
-	on_each_cpu(do_sync_core, NULL, 1);
-}
-
-void text_poke_sync_lock(void)
-{
-	cpus_read_lock();
-	text_poke_sync();
-	cpus_read_unlock();
-}
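Review note: the alternative.c rework above replaces the single facility test with a type dispatch gated by a context mask. A minimal, self-contained model of that dispatch (user-space C; the struct is simplified and the predicate stubs merely stand in for test_facility(), nobp_enabled() and have_relocated_lowcore() — the real struct alt_instr in asm/alternative.h stores self-relative offsets):

#include <stdbool.h>

enum alt_type { ALT_TYPE_FACILITY, ALT_TYPE_SPEC, ALT_TYPE_LOWCORE };

struct alt_entry {
	unsigned int ctx;	/* which patching pass(es) this entry is for */
	unsigned int type;	/* one of enum alt_type */
	unsigned int data;	/* facility number for ALT_TYPE_FACILITY */
};

/* illustrative stand-ins, not the kernel helpers */
static bool test_facility_stub(unsigned int nr) { return nr == 149; }
static bool nobp_enabled_stub(void) { return false; }
static bool have_relocated_lowcore_stub(void) { return true; }

/* mirrors the decision made per entry in __apply_alternatives() */
static bool should_replace(const struct alt_entry *a, unsigned int ctx)
{
	if (!(a->ctx & ctx))
		return false;	/* entry not meant for this patching pass */
	switch (a->type) {
	case ALT_TYPE_FACILITY:
		return test_facility_stub(a->data);
	case ALT_TYPE_SPEC:
		return nobp_enabled_stub();
	case ALT_TYPE_LOWCORE:
		return have_relocated_lowcore_stub();
	default:
		return false;
	}
}

Only when should_replace() says yes are the instruction and replacement addresses computed and handed to s390_kernel_write(), so unknown entry types now fail safe instead of being patched.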
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index fa5f6885c74a..36709112ae7a 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -13,8 +13,6 @@
 #include <linux/purgatory.h>
 #include <linux/pgtable.h>
 #include <linux/ftrace.h>
-#include <asm/idle.h>
-#include <asm/gmap.h>
 #include <asm/stacktrace.h>
 
 int main(void)
@@ -29,6 +27,7 @@
 	BLANK();
 	/* thread info offsets */
 	OFFSET(__TI_flags, task_struct, thread_info.flags);
+	OFFSET(__TI_sie, task_struct, thread_info.sie);
 	BLANK();
 	/* pt_regs offsets */
 	OFFSET(__PT_PSW, pt_regs, psw);
@@ -64,12 +63,13 @@
 	OFFSET(__SF_SIE_REASON, stack_frame, sie_reason);
 	OFFSET(__SF_SIE_FLAGS, stack_frame, sie_flags);
 	OFFSET(__SF_SIE_CONTROL_PHYS, stack_frame, sie_control_block_phys);
+	OFFSET(__SF_SIE_GUEST_ASCE, stack_frame, sie_guest_asce);
 	DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
 	BLANK();
-	/* idle data offsets */
-	OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
-	OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter);
-	OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter);
+	OFFSET(__SFUSER_BACKCHAIN, stack_frame_user, back_chain);
+	DEFINE(STACK_FRAME_USER_OVERHEAD, sizeof(struct stack_frame_user));
+	OFFSET(__SFVDSO_RETURN_ADDRESS, stack_frame_vdso_wrapper, return_address);
+	DEFINE(STACK_FRAME_VDSO_OVERHEAD, sizeof(struct stack_frame_vdso_wrapper));
 	BLANK();
 	/* hardware defined lowcore locations 0x000 - 0x1ff */
 	OFFSET(__LC_EXT_PARAMS, lowcore, ext_params);
@@ -111,10 +111,9 @@
 	OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw);
 	OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw);
 	/* software defined lowcore locations 0x200 - 0xdff*/
-	OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync);
-	OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async);
+	OFFSET(__LC_SAVE_AREA, lowcore, save_area);
 	OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart);
-	OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags);
+	OFFSET(__LC_PCPU, lowcore, pcpu);
 	OFFSET(__LC_RETURN_PSW, lowcore, return_psw);
 	OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw);
 	OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer);
@@ -138,7 +137,6 @@
 	OFFSET(__LC_USER_ASCE, lowcore, user_asce);
 	OFFSET(__LC_LPP, lowcore, lpp);
 	OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
-	OFFSET(__LC_GMAP, lowcore, gmap);
 	OFFSET(__LC_LAST_BREAK, lowcore, last_break);
 	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
 	OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
@@ -161,7 +159,6 @@
 	OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
 	BLANK();
 	/* gmap/sie offsets */
-	OFFSET(__GMAP_ASCE, gmap, asce);
 	OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
 	OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20);
 	/* kexec_sha_region */
@@ -178,13 +175,9 @@
 	DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
 	DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
 	DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	/* function graph return value tracing */
-	OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2);
-	OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp);
-	DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs));
-#endif
-	OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
-	DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
+	OFFSET(__FTRACE_REGS_PT_REGS, __arch_ftrace_regs, regs);
+	DEFINE(__FTRACE_REGS_SIZE, sizeof(struct __arch_ftrace_regs));
+
+	OFFSET(__PCPU_FLAGS, pcpu, flags);
 	return 0;
 }
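Review note: asm-offsets.c is never linked into the kernel; it only feeds the generated asm-offsets.h, which is why entries such as __TI_sie can be added and the idle/gmap ones dropped freely. A minimal sketch of the underlying kbuild technique (macro and symbol names here are illustrative, not the kernel's):

#include <stddef.h>

struct example { long a; long b; };

/* The compiler emits "->EXAMPLE_B 8" as literal text into the assembly
 * output; a build script then greps such markers into a header of
 * "#define EXAMPLE_B 8" lines that assembly code can include. */
#define DEFINE_OFF(sym, str, mem) \
	asm volatile("\n.ascii \"->" #sym " %0\"" : : "i" (offsetof(str, mem)))

int main(void)
{
	DEFINE_OFF(EXAMPLE_B, struct example, b);
	return 0;
}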
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
index 554447768bdd..bf983513dd33 100644
--- a/arch/s390/kernel/cert_store.c
+++ b/arch/s390/kernel/cert_store.c
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
+#include <linux/vmalloc.h>
 #include <crypto/sha2.h>
 #include <keys/user-type.h>
 #include <asm/debug.h>
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 1942e2a9f8db..5a86b9d1da71 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -24,11 +24,11 @@
 #include <linux/tty.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
+#include <asm/vdso-symbols.h>
 #include <asm/access-regs.h>
 #include <asm/ucontext.h>
 #include <linux/uaccess.h>
 #include <asm/lowcore.h>
-#include <asm/vdso.h>
 #include <asm/fpu.h>
 #include "compat_linux.h"
 #include "compat_ptrace.h"
diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c
new file mode 100644
index 000000000000..4b9b34f95d72
--- /dev/null
+++ b/arch/s390/kernel/cpacf.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "cpacf"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <asm/cpacf.h>
+
+#define CPACF_QUERY(name, instruction)					\
+static ssize_t name##_query_raw_read(struct file *fp,			\
+				     struct kobject *kobj,		\
+				     const struct bin_attribute *attr,	\
+				     char *buf, loff_t offs,		\
+				     size_t count)			\
+{									\
+	cpacf_mask_t mask;						\
+									\
+	if (!cpacf_query(CPACF_##instruction, &mask))			\
+		return -EOPNOTSUPP;					\
+	return memory_read_from_buffer(buf, count, &offs, &mask, sizeof(mask)); \
+}									\
+static const BIN_ATTR_RO(name##_query_raw, sizeof(cpacf_mask_t))
+
+CPACF_QUERY(km, KM);
+CPACF_QUERY(kmc, KMC);
+CPACF_QUERY(kimd, KIMD);
+CPACF_QUERY(klmd, KLMD);
+CPACF_QUERY(kmac, KMAC);
+CPACF_QUERY(pckmo, PCKMO);
+CPACF_QUERY(kmf, KMF);
+CPACF_QUERY(kmctr, KMCTR);
+CPACF_QUERY(kmo, KMO);
+CPACF_QUERY(pcc, PCC);
+CPACF_QUERY(prno, PRNO);
+CPACF_QUERY(kma, KMA);
+CPACF_QUERY(kdsa, KDSA);
+
+#define CPACF_QAI(name, instruction)					\
+static ssize_t name##_query_auth_info_raw_read(			\
+	struct file *fp, struct kobject *kobj,				\
+	const struct bin_attribute *attr, char *buf, loff_t offs,	\
+	size_t count)							\
+{									\
+	cpacf_qai_t qai;						\
+									\
+	if (!cpacf_qai(CPACF_##instruction, &qai))			\
+		return -EOPNOTSUPP;					\
+	return memory_read_from_buffer(buf, count, &offs, &qai,	\
+				       sizeof(qai));			\
+}									\
+static const BIN_ATTR_RO(name##_query_auth_info_raw, sizeof(cpacf_qai_t))
+
+CPACF_QAI(km, KM);
+CPACF_QAI(kmc, KMC);
+CPACF_QAI(kimd, KIMD);
+CPACF_QAI(klmd, KLMD);
+CPACF_QAI(kmac, KMAC);
+CPACF_QAI(pckmo, PCKMO);
+CPACF_QAI(kmf, KMF);
+CPACF_QAI(kmctr, KMCTR);
+CPACF_QAI(kmo, KMO);
+CPACF_QAI(pcc, PCC);
+CPACF_QAI(prno, PRNO);
+CPACF_QAI(kma, KMA);
+CPACF_QAI(kdsa, KDSA);
+
+static const struct bin_attribute *const cpacf_attrs[] = {
+	&bin_attr_km_query_raw,
+	&bin_attr_kmc_query_raw,
+	&bin_attr_kimd_query_raw,
+	&bin_attr_klmd_query_raw,
+	&bin_attr_kmac_query_raw,
+	&bin_attr_pckmo_query_raw,
+	&bin_attr_kmf_query_raw,
+	&bin_attr_kmctr_query_raw,
+	&bin_attr_kmo_query_raw,
+	&bin_attr_pcc_query_raw,
+	&bin_attr_prno_query_raw,
+	&bin_attr_kma_query_raw,
+	&bin_attr_kdsa_query_raw,
+	&bin_attr_km_query_auth_info_raw,
+	&bin_attr_kmc_query_auth_info_raw,
+	&bin_attr_kimd_query_auth_info_raw,
+	&bin_attr_klmd_query_auth_info_raw,
+	&bin_attr_kmac_query_auth_info_raw,
+	&bin_attr_pckmo_query_auth_info_raw,
+	&bin_attr_kmf_query_auth_info_raw,
+	&bin_attr_kmctr_query_auth_info_raw,
+	&bin_attr_kmo_query_auth_info_raw,
+	&bin_attr_pcc_query_auth_info_raw,
+	&bin_attr_prno_query_auth_info_raw,
+	&bin_attr_kma_query_auth_info_raw,
+	&bin_attr_kdsa_query_auth_info_raw,
+	NULL,
+};
+
+static const struct attribute_group cpacf_attr_grp = {
+	.name = "cpacf",
+	.bin_attrs_new = cpacf_attrs,
+};
+
+static int __init cpacf_init(void)
+{
+	struct device *cpu_root;
+	int rc = 0;
+
+	cpu_root = bus_get_dev_root(&cpu_subsys);
+	if (cpu_root) {
+		rc = sysfs_create_group(&cpu_root->kobj, &cpacf_attr_grp);
+		put_device(cpu_root);
+	}
+	return rc;
+}
+device_initcall(cpacf_init);
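Review note: the new cpacf.c exposes the raw CPACF query masks through sysfs. A hypothetical user-space reader — the path follows from the cpu subsystem root (/sys/devices/system/cpu) plus the group name "cpacf"; the assumption here is that cpacf_mask_t is a 16-byte (128-bit) function mask:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char mask[16];	/* assumed sizeof(cpacf_mask_t) */
	ssize_t n, i;
	int fd;

	fd = open("/sys/devices/system/cpu/cpacf/km_query_raw", O_RDONLY);
	if (fd < 0) {
		perror("open");	/* kernel too old, or KM query unavailable */
		return 1;
	}
	n = read(fd, mask, sizeof(mask));
	for (i = 0; i < n; i++)
		printf("%02x", mask[i]);
	putchar('\n');
	close(fd);
	return 0;
}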
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index b210a29d3ee9..2f4174b961de 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -20,6 +20,7 @@
 #include <asm/diag.h>
 #include <asm/ebcdic.h>
 #include <asm/cpcmd.h>
+#include <asm/asm.h>
 
 static DEFINE_SPINLOCK(cpcmd_lock);
 static char cpcmd_buf[241];
@@ -45,12 +46,11 @@
 static int diag8_response(int cmdlen, char *response, int *rlen)
 	ry.odd	= *rlen;
 	asm volatile(
 		"	diag	%[rx],%[ry],0x8\n"
-		"	ipm	%[cc]\n"
-		"	srl	%[cc],28\n"
-		: [cc] "=&d" (cc), [ry] "+&d" (ry.pair)
+		CC_IPM(cc)
+		: CC_OUT(cc, cc), [ry] "+d" (ry.pair)
 		: [rx] "d" (rx.pair)
-		: "cc");
-	if (cc)
+		: CC_CLOBBER);
+	if (CC_TRANSFORM(cc))
 		*rlen += ry.odd;
 	else
 		*rlen = ry.odd;
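Review note: the CC_IPM()/CC_OUT()/CC_TRANSFORM() macros from the new asm/asm.h replace the open-coded condition-code extraction that the removed lines show. For reference, the retired pattern was equivalent to this sketch; "ipm" inserts the 2-bit condition code into bits 2-3 of the register and "srl ...,28" shifts it down to the value range 0..3 (on compilers with flag-output constraint support the macros can presumably skip the ipm/srl pair entirely — an assumption about their implementation, not shown in this diff):

static inline int get_cc_sketch(void)
{
	int cc;

	asm volatile(
		"	ipm	%[cc]\n"
		"	srl	%[cc],28\n"
		: [cc] "=d" (cc) : : "cc");
	return cc;	/* condition code 0..3 */
}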
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d09ebb6f5262..276cb4c1e11b 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -63,9 +63,7 @@
 struct save_area * __init save_area_alloc(bool is_boot_cpu)
 {
 	struct save_area *sa;
 
-	sa = memblock_alloc(sizeof(*sa), 8);
-	if (!sa)
-		return NULL;
+	sa = memblock_alloc_or_panic(sizeof(*sa), 8);
 
 	if (is_boot_cpu)
 		list_add(&sa->list, &dump_save_areas);
@@ -237,6 +235,17 @@
 int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
 			   prot);
 }
 
+/*
+ * Return true only when in a kdump or stand-alone kdump environment.
+ * Note that /proc/vmcore might also be available in "standard zfcp/nvme dump"
+ * environments, where this function returns false; see dump_available().
+ */
+bool is_kdump_kernel(void)
+{
+	return oldmem_data.start;
+}
+EXPORT_SYMBOL_GPL(is_kdump_kernel);
+
 static const char *nt_name(Elf64_Word type)
 {
 	const char *name = "LINUX";
@@ -451,7 +460,7 @@
 static void *nt_final(void *ptr)
 /*
  * Initialize ELF header (new kernel)
  */
-static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt)
+static void *ehdr_init(Elf64_Ehdr *ehdr, int phdr_count)
 {
 	memset(ehdr, 0, sizeof(*ehdr));
 	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
@@ -465,7 +474,8 @@
 	ehdr->e_phoff = sizeof(Elf64_Ehdr);
 	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
 	ehdr->e_phentsize = sizeof(Elf64_Phdr);
-	ehdr->e_phnum = mem_chunk_cnt + 1;
+	/* Number of PT_LOAD program headers plus PT_NOTE program header */
+	ehdr->e_phnum = phdr_count + 1;
 	return ehdr + 1;
 }
@@ -496,27 +506,77 @@
 	return cnt;
 }
 
+static void fill_ptload(Elf64_Phdr *phdr, unsigned long paddr,
+			unsigned long vaddr, unsigned long size)
+{
+	phdr->p_type = PT_LOAD;
+	phdr->p_vaddr = vaddr;
+	phdr->p_offset = paddr;
+	phdr->p_paddr = paddr;
+	phdr->p_filesz = size;
+	phdr->p_memsz = size;
+	phdr->p_flags = PF_R | PF_W | PF_X;
+	phdr->p_align = PAGE_SIZE;
+}
+
 /*
  * Initialize ELF loads (new kernel)
  */
-static void loads_init(Elf64_Phdr *phdr)
+static void loads_init(Elf64_Phdr *phdr, bool os_info_has_vm)
 {
+	unsigned long old_identity_base = 0;
 	phys_addr_t start, end;
 	u64 idx;
 
+	if (os_info_has_vm)
+		old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE);
 	for_each_physmem_range(idx, &oldmem_type, &start, &end) {
-		phdr->p_filesz = end - start;
-		phdr->p_type = PT_LOAD;
-		phdr->p_offset = start;
-		phdr->p_vaddr = (unsigned long)__va(start);
-		phdr->p_paddr = start;
-		phdr->p_memsz = end - start;
-		phdr->p_flags = PF_R | PF_W | PF_X;
-		phdr->p_align = PAGE_SIZE;
+		fill_ptload(phdr, start, old_identity_base + start,
+			    end - start);
 		phdr++;
 	}
 }
 
+static bool os_info_has_vm(void)
+{
+	return os_info_old_value(OS_INFO_KASLR_OFFSET);
+}
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
+/*
+ * Fill PT_LOAD for a physical memory range owned by a device and detected by
+ * its device driver.
+ */
+void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr,
+					     unsigned long long paddr, unsigned long long size)
+{
+	unsigned long old_identity_base = 0;
+
+	if (os_info_has_vm())
+		old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE);
+	fill_ptload(phdr, paddr, old_identity_base + paddr, size);
+}
+#endif
+
+/*
+ * Prepare PT_LOAD type program header for kernel image region
+ */
+static void text_init(Elf64_Phdr *phdr)
+{
+	unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS);
+	unsigned long start = os_info_old_value(OS_INFO_IMAGE_START);
+	unsigned long end = os_info_old_value(OS_INFO_IMAGE_END);
+
+	phdr->p_type = PT_LOAD;
+	phdr->p_vaddr = start;
+	phdr->p_filesz = end - start;
+	phdr->p_memsz = end - start;
+	phdr->p_offset = start_phys;
+	phdr->p_paddr = start_phys;
+	phdr->p_flags = PF_R | PF_W | PF_X;
+	phdr->p_align = PAGE_SIZE;
+}
+
 /*
  * Initialize notes (new kernel)
  */
@@ -542,7 +602,7 @@
 static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
 	return ptr;
 }
 
-static size_t get_elfcorehdr_size(int mem_chunk_cnt)
+static size_t get_elfcorehdr_size(int phdr_count)
 {
 	size_t size;
 
@@ -558,7 +618,7 @@
 	/* nt_final */
 	size += sizeof(Elf64_Nhdr);
 	/* PT_LOADS */
-	size += mem_chunk_cnt * sizeof(Elf64_Phdr);
+	size += phdr_count * sizeof(Elf64_Phdr);
 
 	return size;
 }
@@ -568,9 +628,9 @@
 */
 int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
 {
-	Elf64_Phdr *phdr_notes, *phdr_loads;
+	Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text;
+	int mem_chunk_cnt, phdr_text_cnt;
 	size_t alloc_size;
-	int mem_chunk_cnt;
 	void *ptr, *hdr;
 	u64 hdr_off;
 
@@ -589,12 +649,14 @@
 	}
 
 	mem_chunk_cnt = get_mem_chunk_cnt();
+	phdr_text_cnt = os_info_has_vm() ? 1 : 0;
 
-	alloc_size = get_elfcorehdr_size(mem_chunk_cnt);
+	alloc_size = get_elfcorehdr_size(mem_chunk_cnt + phdr_text_cnt);
 
 	hdr = kzalloc(alloc_size, GFP_KERNEL);
 
-	/* Without elfcorehdr /proc/vmcore cannot be created. Thus creating
+	/*
+	 * Without elfcorehdr /proc/vmcore cannot be created. Thus creating
	 * a dump with this crash kernel will fail. Panic now to allow other
	 * dump mechanisms to take over.
	 */
@@ -602,18 +664,25 @@
 		panic("s390 kdump allocating elfcorehdr failed");
 
 	/* Init elf header */
-	ptr = ehdr_init(hdr, mem_chunk_cnt);
+	phdr_notes = ehdr_init(hdr, mem_chunk_cnt + phdr_text_cnt);
 	/* Init program headers */
-	phdr_notes = ptr;
-	ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
-	phdr_loads = ptr;
-	ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt);
+	if (phdr_text_cnt) {
+		phdr_text = phdr_notes + 1;
+		phdr_loads = phdr_text + 1;
+	} else {
+		phdr_loads = phdr_notes + 1;
+	}
+	ptr = PTR_ADD(phdr_loads, sizeof(Elf64_Phdr) * mem_chunk_cnt);
 	/* Init notes */
 	hdr_off = PTR_DIFF(ptr, hdr);
 	ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
+	/* Init kernel text program header */
+	if (phdr_text_cnt)
+		text_init(phdr_text);
 	/* Init loads */
+	loads_init(phdr_loads, phdr_text_cnt);
+	/* Finalize program headers */
 	hdr_off = PTR_DIFF(ptr, hdr);
-	loads_init(phdr_loads);
 	*addr = (unsigned long long) hdr;
 	*size = (unsigned long long) hdr_off;
 	BUG_ON(elfcorehdr_size > alloc_size);
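Review note: with the optional kernel-image header, elfcorehdr_alloc() now lays out one PT_NOTE header, then (when os_info carries KASLR data) one PT_LOAD for the kernel text, then one PT_LOAD per physical memory chunk. A small user-space sketch of the size arithmetic; this mirrors only the Ehdr/Phdr portion of get_elfcorehdr_size(), not the note payloads:

#include <elf.h>
#include <stdio.h>

/* phdr_count = memory chunks + optional kernel-text header;
 * the +1 accounts for the PT_NOTE program header. */
static size_t ehdr_and_phdrs_size(int phdr_count)
{
	return sizeof(Elf64_Ehdr) + (phdr_count + 1) * sizeof(Elf64_Phdr);
}

int main(void)
{
	int mem_chunk_cnt = 2, phdr_text_cnt = 1;	/* example values */

	printf("%zu bytes before the notes\n",
	       ehdr_and_phdrs_size(mem_chunk_cnt + phdr_text_cnt));
	return 0;
}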
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 85328a0ef3b6..ce038e9205f7 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/math.h>
 #include <linux/minmax.h>
 #include <linux/debugfs.h>
@@ -38,13 +39,13 @@
 typedef struct file_private_info {
 	loff_t offset;			/* offset of last read in file */
-	int act_area;			/* number of last formated area */
+	int act_area;			/* number of last formatted area */
 	int act_page;			/* act page in given area */
-	int act_entry;			/* last formated entry (offset */
+	int act_entry;			/* last formatted entry (offset */
 					/* relative to beginning of last */
-					/* formated page) */
+					/* formatted page) */
 	size_t act_entry_offset;	/* up to this offset we copied */
-					/* in last read the last formated */
+					/* in last read the last formatted */
 					/* entry to userland */
 	char temp_buf[2048];		/* buffer for output */
 	debug_info_t *debug_info_org;	/* original debug information */
@@ -63,7 +64,7 @@
 typedef struct {
 	long args[];
 } debug_sprintf_entry_t;
 
-/* internal function prototyes */
+/* internal function prototypes */
 
 static int debug_init(void);
 static ssize_t debug_output(struct file *file, char __user *user_buf,
@@ -77,12 +78,14 @@
 static debug_info_t *debug_info_create(const char *name, int pages_per_area,
 static void debug_info_get(debug_info_t *);
 static void debug_info_put(debug_info_t *);
 static int debug_prolog_level_fn(debug_info_t *id,
-				 struct debug_view *view, char *out_buf);
+				 struct debug_view *view, char *out_buf,
+				 size_t out_buf_size);
 static int debug_input_level_fn(debug_info_t *id, struct debug_view *view,
 				struct file *file, const char __user *user_buf,
 				size_t user_buf_size, loff_t *offset);
 static int debug_prolog_pages_fn(debug_info_t *id,
-				 struct debug_view *view, char *out_buf);
+				 struct debug_view *view, char *out_buf,
+				 size_t out_buf_size);
 static int debug_input_pages_fn(debug_info_t *id, struct debug_view *view,
 				struct file *file, const char __user *user_buf,
 				size_t user_buf_size, loff_t *offset);
@@ -90,9 +93,8 @@
 static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
 				struct file *file, const char __user *user_buf,
 				size_t user_buf_size, loff_t *offset);
 static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
-				     char *out_buf, const char *in_buf);
-static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
-				   char *out_buf, const char *inbuf);
+				     char *out_buf, size_t out_buf_size,
+				     const char *in_buf);
 static void debug_areas_swap(debug_info_t *a, debug_info_t *b);
 static void debug_events_append(debug_info_t *dest, debug_info_t *src);
@@ -163,7 +165,6 @@
 static const struct file_operations debug_file_ops = {
 	.write	 = debug_input,
 	.open	 = debug_open,
 	.release = debug_close,
-	.llseek  = no_llseek,
 };
 
 static struct dentry *debug_debugfs_root_entry;
@@ -351,7 +352,10 @@
 static debug_info_t *debug_info_copy(debug_info_t *in, int mode)
 	for (i = 0; i < in->nr_areas; i++) {
 		for (j = 0; j < in->pages_per_area; j++)
 			memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE);
+		rc->active_pages[i] = in->active_pages[i];
+		rc->active_entries[i] = in->active_entries[i];
 	}
+	rc->active_area = in->active_area;
 out:
 	spin_unlock_irqrestore(&in->lock, flags);
 	return rc;
@@ -381,7 +385,7 @@
 static void debug_info_put(debug_info_t *db_info)
 
 /*
  * debug_format_entry:
- * - format one debug entry and return size of formated data
+ * - format one debug entry and return size of formatted data
  */
 static int debug_format_entry(file_private_info_t *p_info)
 {
@@ -392,8 +396,10 @@
 
 	if (p_info->act_entry == DEBUG_PROLOG_ENTRY) {
 		/* print prolog */
-		if (view->prolog_proc)
-			len += view->prolog_proc(id_snap, view, p_info->temp_buf);
+		if (view->prolog_proc) {
+			len += view->prolog_proc(id_snap, view, p_info->temp_buf,
+						 sizeof(p_info->temp_buf));
+		}
 		goto out;
 	}
 	if (!id_snap->areas) /* this is true, if we have a prolog only view */
@@ -403,21 +409,31 @@
 	if (act_entry->clock == 0LL)
 		goto out; /* empty entry */
-	if (view->header_proc)
+	if (view->header_proc) {
 		len += view->header_proc(id_snap, view, p_info->act_area,
-					 act_entry, p_info->temp_buf + len);
-	if (view->format_proc)
+					 act_entry, p_info->temp_buf + len,
+					 sizeof(p_info->temp_buf) - len);
+	}
+	if (view->format_proc) {
 		len += view->format_proc(id_snap, view, p_info->temp_buf + len,
+					 sizeof(p_info->temp_buf) - len,
 					 DEBUG_DATA(act_entry));
+	}
 out:
 	return len;
 }
 
-/*
- * debug_next_entry:
- * - goto next entry in p_info
+/**
+ * debug_next_entry - Go to the next entry
+ * @p_info: Private info that is manipulated
+ *
+ * Sets the current position in @p_info to the next entry. If no further entry
+ * exists, the current position is set to one past the end and the return value
+ * indicates that no further entries exist.
+ *
+ * Return: True if there are more following entries, false otherwise
 */
-static inline int debug_next_entry(file_private_info_t *p_info)
+static inline bool debug_next_entry(file_private_info_t *p_info)
 {
 	debug_info_t *id;
@@ -425,10 +441,10 @@
 	if (p_info->act_entry == DEBUG_PROLOG_ENTRY) {
 		p_info->act_entry = 0;
 		p_info->act_page = 0;
-		goto out;
+		return true;
 	}
 	if (!id->areas)
-		return 1;
+		return false;
 	p_info->act_entry += id->entry_size;
 	/* switch to next page, if we reached the end of the page */
 	if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) {
@@ -441,16 +457,93 @@
 			p_info->act_page = 0;
 		}
 		if (p_info->act_area >= id->nr_areas)
-			return 1;
+			return false;
 	}
-out:
-	return 0;
+	return true;
+}
+
+/**
+ * debug_to_act_entry - Go to the currently active entry
+ * @p_info: Private info that is manipulated
+ *
+ * Sets the current position in @p_info to the currently active
+ * entry of @p_info->debug_info_snap
+ */
+static void debug_to_act_entry(file_private_info_t *p_info)
+{
+	debug_info_t *snap_id;
+
+	snap_id = p_info->debug_info_snap;
+	p_info->act_area = snap_id->active_area;
+	p_info->act_page = snap_id->active_pages[snap_id->active_area];
+	p_info->act_entry = snap_id->active_entries[snap_id->active_area];
+}
+
+/**
+ * debug_prev_entry - Go to the previous entry
+ * @p_info: Private info that is manipulated
+ *
+ * Sets the current position in @p_info to the previous entry. If no previous
+ * entry exists, the current position is left at DEBUG_PROLOG_ENTRY and the
+ * return value indicates that no previous entries exist.
+ *
+ * Return: True if there are more previous entries, false otherwise
+ */
+
+static inline bool debug_prev_entry(file_private_info_t *p_info)
+{
+	debug_info_t *id;
+
+	id = p_info->debug_info_snap;
+	if (p_info->act_entry == DEBUG_PROLOG_ENTRY)
+		debug_to_act_entry(p_info);
+	if (!id->areas)
+		return false;
+	p_info->act_entry -= id->entry_size;
+	/* switch to prev page, if we reached the beginning of the page */
+	if (p_info->act_entry < 0) {
+		/* end of previous page */
+		p_info->act_entry = rounddown(PAGE_SIZE, id->entry_size) - id->entry_size;
+		p_info->act_page--;
+		if (p_info->act_page < 0) {
+			/* previous area */
+			p_info->act_area--;
+			p_info->act_page = id->pages_per_area - 1;
+		}
+		if (p_info->act_area < 0)
+			p_info->act_area = (id->nr_areas - 1) % id->nr_areas;
+	}
+	/* check full circle */
+	if (id->active_area == p_info->act_area &&
+	    id->active_pages[id->active_area] == p_info->act_page &&
+	    id->active_entries[id->active_area] == p_info->act_entry)
+		return false;
+	return true;
+}
+
+/**
+ * debug_move_entry - Go to next entry in either the forward or backward direction
+ * @p_info: Private info that is manipulated
+ * @reverse: If true go to the next entry in reverse i.e. previous
+ *
+ * Sets the current position in @p_info to the next (@reverse == false) or
+ * previous (@reverse == true) entry.
+ *
+ * Return: True if there are further entries in that direction,
+ * false otherwise.
+ */
+static bool debug_move_entry(file_private_info_t *p_info, bool reverse)
+{
+	if (reverse)
+		return debug_prev_entry(p_info);
+	else
+		return debug_next_entry(p_info);
 }
 
 /*
  * debug_output:
  * - called for user read()
- * - copies formated debug entries to the user buffer
+ * - copies formatted debug entries to the user buffer
 */
 static ssize_t debug_output(struct file *file,		/* file descriptor */
 			    char __user *user_buf,	/* user buffer */
@@ -486,7 +579,7 @@
 		}
 		if (copy_size == formatted_line_residue) {
 			entry_offset = 0;
-			if (debug_next_entry(p_info))
+			if (!debug_next_entry(p_info))
 				goto out;
 		}
 	}
@@ -521,15 +614,51 @@
 	return rc; /* number of input characters */
 }
 
+static file_private_info_t *debug_file_private_alloc(debug_info_t *debug_info,
+						     struct debug_view *view)
+{
+	debug_info_t *debug_info_snapshot;
+	file_private_info_t *p_info;
+
+	/*
+	 * Make snapshot of current debug areas to get it consistent.
+	 * To copy all the areas is only needed, if we have a view which
+	 * formats the debug areas.
+	 */
+	if (!view->format_proc && !view->header_proc)
+		debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS);
+	else
+		debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS);
+
+	if (!debug_info_snapshot)
+		return NULL;
+	p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL);
+	if (!p_info) {
+		debug_info_free(debug_info_snapshot);
+		return NULL;
+	}
+	p_info->offset = 0;
+	p_info->debug_info_snap = debug_info_snapshot;
+	p_info->debug_info_org = debug_info;
+	p_info->view = view;
+	p_info->act_area = 0;
+	p_info->act_page = 0;
+	p_info->act_entry = DEBUG_PROLOG_ENTRY;
+	p_info->act_entry_offset = 0;
+	debug_info_get(debug_info);
+
+	return p_info;
+}
+
 /*
  * debug_open:
  * - called for user open()
- * - copies formated output to private_data area of the file
+ * - copies formatted output to private_data area of the file
 *   handle
 */
 static int debug_open(struct inode *inode, struct file *file)
 {
-	debug_info_t *debug_info, *debug_info_snapshot;
+	debug_info_t *debug_info;
 	file_private_info_t *p_info;
 	int i, rc = 0;
 
@@ -547,42 +676,26 @@
 	goto out;
 
 found:
-
-	/* Make snapshot of current debug areas to get it consistent. */
-	/* To copy all the areas is only needed, if we have a view which */
-	/* formats the debug areas. */
-
-	if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc)
-		debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS);
-	else
-		debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS);
-
-	if (!debug_info_snapshot) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL);
+	p_info = debug_file_private_alloc(debug_info, debug_info->views[i]);
 	if (!p_info) {
-		debug_info_free(debug_info_snapshot);
 		rc = -ENOMEM;
 		goto out;
 	}
-	p_info->offset = 0;
-	p_info->debug_info_snap = debug_info_snapshot;
-	p_info->debug_info_org = debug_info;
-	p_info->view = debug_info->views[i];
-	p_info->act_area = 0;
-	p_info->act_page = 0;
-	p_info->act_entry = DEBUG_PROLOG_ENTRY;
-	p_info->act_entry_offset = 0;
 	file->private_data = p_info;
-	debug_info_get(debug_info);
 	nonseekable_open(inode, file);
 out:
 	mutex_unlock(&debug_mutex);
 	return rc;
 }
 
+static void debug_file_private_free(file_private_info_t *p_info)
+{
+	if (p_info->debug_info_snap)
+		debug_info_free(p_info->debug_info_snap);
+	debug_info_put(p_info->debug_info_org);
+	kfree(p_info);
+}
+
 /*
  * debug_close:
  * - called for user close()
@@ -593,13 +706,59 @@
 static int debug_close(struct inode *inode, struct file *file)
 	file_private_info_t *p_info;
 
 	p_info = (file_private_info_t *) file->private_data;
-	if (p_info->debug_info_snap)
-		debug_info_free(p_info->debug_info_snap);
-	debug_info_put(p_info->debug_info_org);
-	kfree(file->private_data);
+	debug_file_private_free(p_info);
+	file->private_data = NULL;
 	return 0; /* success */
 }
 
+/**
+ * debug_dump - Get a textual representation of debug info, or as much as fits
+ * @id: Debug information to use
+ * @view: View with which to dump the debug information
+ * @buf: Buffer the textual debug data representation is written to
+ * @buf_size: Size of the buffer, including the trailing '\0' byte
+ * @reverse: Go backwards from the last written entry
+ *
+ * This function may be used whenever a textual representation of the debug
+ * information is required without using an s390dbf file.
+ *
+ * Note: It is the caller's responsibility to supply a view that is compatible
+ * with the debug information data.
+ *
+ * Return: On success returns the number of bytes written to the buffer not
+ * including the trailing '\0' byte. If buf_size == 0 the function returns 0.
+ * On failure an error code less than 0 is returned.
+ */
+ssize_t debug_dump(debug_info_t *id, struct debug_view *view,
+		   char *buf, size_t buf_size, bool reverse)
+{
+	file_private_info_t *p_info;
+	size_t size, offset = 0;
+
+	/* Need space for '\0' byte */
+	if (buf_size < 1)
+		return 0;
+	buf_size--;
+
+	p_info = debug_file_private_alloc(id, view);
+	if (!p_info)
+		return -ENOMEM;
+
+	/* There is always at least the DEBUG_PROLOG_ENTRY */
+	do {
+		size = debug_format_entry(p_info);
+		size = min(size, buf_size - offset);
+		memcpy(buf + offset, p_info->temp_buf, size);
+		offset += size;
+		if (offset >= buf_size)
+			break;
+	} while (debug_move_entry(p_info, reverse));
+	debug_file_private_free(p_info);
+	buf[offset] = '\0';
+
+	return offset;
+}
+
 /* Create debugfs entries and add to internal list. */
 static void _debug_register(debug_info_t *id)
 {
@@ -954,7 +1113,7 @@
 static int debug_active = 1;
 * always allow read, allow write only if debug_stoppable is set or
 * if debug_active is already off
 */
-static int s390dbf_procactive(struct ctl_table *table, int write,
+static int s390dbf_procactive(const struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
 {
	if (!write || debug_stoppable || !debug_active)
@@ -963,7 +1122,7 @@
	return 0;
 }
 
-static struct ctl_table s390dbf_table[] = {
+static const struct ctl_table s390dbf_table[] = {
	{
		.procname	= "debug_stoppable",
		.data		= &debug_stoppable,
@@ -1293,9 +1452,9 @@
 static inline int debug_get_uint(char *buf)
 */
 
 static int debug_prolog_pages_fn(debug_info_t *id, struct debug_view *view,
-				 char *out_buf)
+				 char *out_buf, size_t out_buf_size)
 {
-	return sprintf(out_buf, "%i\n", id->pages_per_area);
+	return scnprintf(out_buf, out_buf_size, "%i\n", id->pages_per_area);
 }
 
 /*
@@ -1342,14 +1501,14 @@
 out:
 * prints out actual debug level
 */
 
 static int debug_prolog_level_fn(debug_info_t *id, struct debug_view *view,
-				 char *out_buf)
+				 char *out_buf, size_t out_buf_size)
 {
	int rc = 0;
 
	if (id->level == DEBUG_OFF_LEVEL)
-		rc = sprintf(out_buf, "-\n");
+		rc = scnprintf(out_buf, out_buf_size, "-\n");
	else
-		rc = sprintf(out_buf, "%i\n", id->level);
+		rc = scnprintf(out_buf, out_buf_size, "%i\n", id->level);
	return rc;
 }
 
@@ -1466,22 +1625,24 @@
 out:
 * prints debug data in hex/ascii format
 */
 static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
-				     char *out_buf, const char *in_buf)
+				     char *out_buf, size_t out_buf_size, const char *in_buf)
 {
	int i, rc = 0;
 
-	for (i = 0; i < id->buf_size; i++)
-		rc += sprintf(out_buf + rc, "%02x ", ((unsigned char *) in_buf)[i]);
-	rc += sprintf(out_buf + rc, "| ");
+	for (i = 0; i < id->buf_size; i++) {
+		rc += scnprintf(out_buf + rc, out_buf_size - rc,
+				"%02x ", ((unsigned char *)in_buf)[i]);
+	}
+	rc += scnprintf(out_buf + rc, out_buf_size - rc, "| ");
	for (i = 0; i < id->buf_size; i++) {
		unsigned char c = in_buf[i];
 
		if (isascii(c) && isprint(c))
-			rc += sprintf(out_buf + rc, "%c", c);
+			rc += scnprintf(out_buf + rc, out_buf_size - rc, "%c", c);
		else
-			rc += sprintf(out_buf + rc, ".");
+			rc += scnprintf(out_buf + rc, out_buf_size - rc, ".");
	}
-	rc += sprintf(out_buf + rc, "\n");
+	rc += scnprintf(out_buf + rc, out_buf_size - rc, "\n");
	return rc;
 }
 
@@ -1489,7 +1650,8 @@
 * prints header for debug entry
 */
 int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
-			 int area, debug_entry_t *entry, char *out_buf)
+			 int area, debug_entry_t *entry, char *out_buf,
+			 size_t out_buf_size)
 {
	unsigned long sec, usec;
	unsigned long caller;
@@ -1506,22 +1668,22 @@
	else
		except_str = "-";
	caller = (unsigned long) entry->caller;
-	rc += sprintf(out_buf, "%02i %011ld:%06lu %1u %1s %04u %px ",
-		      area, sec, usec, level, except_str,
-		      entry->cpu, (void *)caller);
+	rc += scnprintf(out_buf, out_buf_size, "%02i %011ld:%06lu %1u %1s %04u %px ",
+			area, sec, usec, level, except_str,
+			entry->cpu, (void *)caller);
	return rc;
 }
 EXPORT_SYMBOL(debug_dflt_header_fn);
 
 /*
- * prints debug data sprintf-formated:
+ * prints debug data sprintf-formatted:
 * debug_sprinf_event/exception calls must be used together with this view
 */
 
 #define DEBUG_SPRINTF_MAX_ARGS 10
 
-static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
-				   char *out_buf, const char *inbuf)
+int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
+			    char *out_buf, size_t out_buf_size, const char *inbuf)
 {
	debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf;
	int num_longs, num_used_args = 0, i, rc = 0;
@@ -1534,8 +1696,9 @@
		goto out; /* bufsize of entry too small */
	if (num_longs == 1) {
		/* no args, we use only the string */
-		strcpy(out_buf, curr_event->string);
-		rc = strlen(curr_event->string);
+		rc = strscpy(out_buf, curr_event->string, out_buf_size);
+		if (rc == -E2BIG)
+			rc = out_buf_size;
		goto out;
	}
 
@@ -1547,15 +1710,17 @@
	for (i = 0; i < num_used_args; i++)
		index[i] = i;
 
-	rc = sprintf(out_buf, curr_event->string, curr_event->args[index[0]],
-		     curr_event->args[index[1]], curr_event->args[index[2]],
-		     curr_event->args[index[3]], curr_event->args[index[4]],
-		     curr_event->args[index[5]], curr_event->args[index[6]],
-		     curr_event->args[index[7]], curr_event->args[index[8]],
-		     curr_event->args[index[9]]);
+	rc = scnprintf(out_buf, out_buf_size,
+		       curr_event->string, curr_event->args[index[0]],
+		       curr_event->args[index[1]], curr_event->args[index[2]],
+		       curr_event->args[index[3]], curr_event->args[index[4]],
+		       curr_event->args[index[5]], curr_event->args[index[6]],
+		       curr_event->args[index[7]], curr_event->args[index[8]],
+		       curr_event->args[index[9]]);
 out:
	return rc;
 }
+EXPORT_SYMBOL(debug_sprintf_format_fn);
 
 /*
 * debug_init:
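Review note: debug_dump() plus the now-exported debug_sprintf_format_fn() let other kernel code render an s390dbf into a plain buffer without going through debugfs. A hedged kernel-side sketch — it assumes a debug_info_t registered elsewhere, and uses debug_sprintf_view, the stock view for debug_sprintf_event() records:

#include <linux/slab.h>
#include <asm/debug.h>

/* Dump the newest entries of "id" into the kernel log; reverse == true
 * starts at the last written entry and walks backwards. */
static void dump_newest_entries(debug_info_t *id)
{
	ssize_t len;
	char *buf;

	buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		return;
	len = debug_dump(id, &debug_sprintf_view, buf, PAGE_SIZE, true);
	if (len >= 0)
		pr_info("dbf dump (%zd bytes):\n%s", len, buf);
	kfree(buf);
}

The snapshot taken by debug_file_private_alloc() keeps the output consistent even while other CPUs keep logging.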
diff --git a/arch/s390/kernel/diag/Makefile b/arch/s390/kernel/diag/Makefile
new file mode 100644
index 000000000000..956aee6c4090
--- /dev/null
+++ b/arch/s390/kernel/diag/Makefile
@@ -0,0 +1 @@
+obj-y := diag_misc.o diag324.o diag.o diag310.o
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag/diag.c
index 8dee9aa0ec95..e15b8dee3228 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag/diag.c
@@ -16,7 +16,8 @@
 #include <asm/diag.h>
 #include <asm/trace/diag.h>
 #include <asm/sections.h>
-#include "entry.h"
+#include <asm/asm.h>
+#include "../entry.h"
 
 struct diag_stat {
	unsigned int counter[NR_DIAG_STAT];
@@ -50,8 +51,11 @@
 static const struct diag_desc diag_map[NR_DIAG_STAT] = {
	[DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" },
	[DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" },
	[DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" },
+	[DIAG_STAT_X310] = { .code = 0x310, .name = "Memory Topology Information" },
	[DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" },
	[DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" },
+	[DIAG_STAT_X324] = { .code = 0x324, .name = "Power Information Block" },
+	[DIAG_STAT_X49C] = { .code = 0x49c, .name = "Warning-Track Interruption" },
	[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
 };
 
@@ -185,6 +189,8 @@
 }
 EXPORT_SYMBOL(diag14);
 
+#define DIAG204_BUSY_RC	8
+
 static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
 {
	union register_pair rp = { .even = *subcode, .odd = size };
@@ -215,16 +221,18 @@
 int diag204(unsigned long subcode, unsigned long size, void *addr)
 {
	if (addr) {
		if (WARN_ON_ONCE(!is_vmalloc_addr(addr)))
-			return -1;
+			return -EINVAL;
		if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE)))
-			return -1;
+			return -EINVAL;
	}
	if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4)
		addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr));
	diag_stat_inc(DIAG_STAT_X204);
	size = __diag204(&subcode, size, addr);
-	if (subcode)
-		return -1;
+	if (subcode == DIAG204_BUSY_RC)
+		return -EBUSY;
+	else if (subcode)
+		return -EOPNOTSUPP;
	return size;
 }
 EXPORT_SYMBOL(diag204);
@@ -278,12 +286,14 @@
 int diag224(void *ptr)
	int rc = -EOPNOTSUPP;
 
	diag_stat_inc(DIAG_STAT_X224);
-	asm volatile(
-		"	diag	%1,%2,0x224\n"
-		"0:	lhi	%0,0x0\n"
+	asm volatile("\n"
+		"	diag	%[type],%[addr],0x224\n"
+		"0:	lhi	%[rc],0\n"
		"1:\n"
		EX_TABLE(0b,1b)
-		: "+d" (rc) :"d" (0), "d" (addr) : "memory");
+		: [rc] "+d" (rc)
+		, "=m" (*(struct { char buf[PAGE_SIZE]; } *)ptr)
+		: [type] "d" (0), [addr] "d" (addr));
	return rc;
 }
 EXPORT_SYMBOL(diag224);
@@ -297,3 +307,18 @@
	return diag_amode31_ops.diag26c(virt_to_phys(req), virt_to_phys(resp), subcode);
 }
 EXPORT_SYMBOL(diag26c);
+
+int diag49c(unsigned long subcode)
+{
+	int cc;
+
+	diag_stat_inc(DIAG_STAT_X49C);
+	asm volatile(
+		"	diag	%[subcode],0,0x49c\n"
+		CC_IPM(cc)
+		: CC_OUT(cc, cc)
+		: [subcode] "d" (subcode)
+		: CC_CLOBBER);
+	return CC_TRANSFORM(cc);
+}
+EXPORT_SYMBOL(diag49c);
diff --git a/arch/s390/kernel/diag/diag310.c b/arch/s390/kernel/diag/diag310.c
new file mode 100644
index 000000000000..d6a34454aa5a
--- /dev/null
+++ b/arch/s390/kernel/diag/diag310.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Request memory topology information via diag0x310.
+ *
+ * Copyright IBM Corp. 2025
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <asm/diag.h>
+#include <asm/sclp.h>
+#include <uapi/asm/diag.h>
+#include "diag_ioctl.h"
+
+#define DIAG310_LEVELMIN 1
+#define DIAG310_LEVELMAX 6
+
+enum diag310_sc {
+	DIAG310_SUBC_0 = 0,
+	DIAG310_SUBC_1 = 1,
+	DIAG310_SUBC_4 = 4,
+	DIAG310_SUBC_5 = 5
+};
+
+enum diag310_retcode {
+	DIAG310_RET_SUCCESS	= 0x0001,
+	DIAG310_RET_BUSY	= 0x0101,
+	DIAG310_RET_OPNOTSUPP	= 0x0102,
+	DIAG310_RET_SC4_INVAL	= 0x0401,
+	DIAG310_RET_SC4_NODATA	= 0x0402,
+	DIAG310_RET_SC5_INVAL	= 0x0501,
+	DIAG310_RET_SC5_NODATA	= 0x0502,
+	DIAG310_RET_SC5_ESIZE	= 0x0503
+};
+
+union diag310_response {
+	u64 response;
+	struct {
+		u64 result	: 32;
+		u64		: 16;
+		u64 rc		: 16;
+	};
+};
+
+union diag310_req_subcode {
+	u64 subcode;
+	struct {
+		u64		: 48;
+		u64 st		: 8;
+		u64 sc		: 8;
+	};
+};
+
+union diag310_req_size {
+	u64 size;
+	struct {
+		u64 page_count	: 32;
+		u64		: 32;
+	};
+};
+
+static inline unsigned long diag310(unsigned long subcode, unsigned long size, void *addr)
+{
+	union register_pair rp = { .even = (unsigned long)addr, .odd = size };
+
+	diag_stat_inc(DIAG_STAT_X310);
+	asm volatile("diag %[rp],%[subcode],0x310\n"
+		     : [rp] "+d" (rp.pair)
+		     : [subcode] "d" (subcode)
+		     : "memory");
+	return rp.odd;
+}
+
+static int diag310_result_to_errno(unsigned int result)
+{
+	switch (result) {
+	case DIAG310_RET_BUSY:
+		return -EBUSY;
+	case DIAG310_RET_OPNOTSUPP:
+		return -EOPNOTSUPP;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int diag310_get_subcode_mask(unsigned long *mask)
+{
+	union diag310_response res;
+
+	res.response = diag310(DIAG310_SUBC_0, 0, NULL);
+	if (res.rc != DIAG310_RET_SUCCESS)
+		return diag310_result_to_errno(res.rc);
+	*mask = res.response;
+	return 0;
+}
+
+static int diag310_get_memtop_stride(unsigned long *stride)
+{
+	union diag310_response res;
+
+	res.response = diag310(DIAG310_SUBC_1, 0, NULL);
+	if (res.rc != DIAG310_RET_SUCCESS)
+		return diag310_result_to_errno(res.rc);
+	*stride = res.result;
+	return 0;
+}
+
+static int diag310_get_memtop_size(unsigned long *pages, unsigned long level)
+{
+	union diag310_req_subcode req = { .sc = DIAG310_SUBC_4, .st = level };
+	union diag310_response res;
+
+	res.response = diag310(req.subcode, 0, NULL);
+	switch (res.rc) {
+	case DIAG310_RET_SUCCESS:
+		*pages = res.result;
+		return 0;
+	case DIAG310_RET_SC4_NODATA:
+		return -ENODATA;
+	case DIAG310_RET_SC4_INVAL:
+		return -EINVAL;
+	default:
+		return diag310_result_to_errno(res.rc);
+	}
+}
+
+static int diag310_store_topology_map(void *buf, unsigned long pages, unsigned long level)
+{
+	union diag310_req_subcode req_sc = { .sc = DIAG310_SUBC_5, .st = level };
+	union diag310_req_size req_size = { .page_count = pages };
+	union diag310_response res;
+
+	res.response = diag310(req_sc.subcode, req_size.size, buf);
+	switch (res.rc) {
+	case DIAG310_RET_SUCCESS:
+		return 0;
+	case DIAG310_RET_SC5_NODATA:
+		return -ENODATA;
+	case DIAG310_RET_SC5_ESIZE:
+		return -EOVERFLOW;
+	case DIAG310_RET_SC5_INVAL:
+		return -EINVAL;
+	default:
+		return diag310_result_to_errno(res.rc);
+	}
+}
+
+static int diag310_check_features(void)
+{
+	static int features_available;
+	unsigned long mask;
+	int rc;
+
+	if (READ_ONCE(features_available))
+		return 0;
+	if (!sclp.has_diag310)
+		return -EOPNOTSUPP;
+	rc = diag310_get_subcode_mask(&mask);
+	if (rc)
+		return rc;
+	if (!test_bit_inv(DIAG310_SUBC_1, &mask))
+		return -EOPNOTSUPP;
+	if (!test_bit_inv(DIAG310_SUBC_4, &mask))
+		return -EOPNOTSUPP;
+	if (!test_bit_inv(DIAG310_SUBC_5, &mask))
+		return -EOPNOTSUPP;
+	WRITE_ONCE(features_available, 1);
+	return 0;
+}
+
+static int memtop_get_stride_len(unsigned long *res)
+{
+	static unsigned long memtop_stride;
+	unsigned long stride;
+	int rc;
+
+	stride = READ_ONCE(memtop_stride);
+	if (!stride) {
+		rc = diag310_get_memtop_stride(&stride);
+		if (rc)
+			return rc;
+		WRITE_ONCE(memtop_stride, stride);
+	}
+	*res = stride;
+	return 0;
+}
+
+static int memtop_get_page_count(unsigned long *res, unsigned long level)
+{
+	static unsigned long memtop_pages[DIAG310_LEVELMAX];
+	unsigned long pages;
+	int rc;
+
+	if (level > DIAG310_LEVELMAX || level < DIAG310_LEVELMIN)
+		return -EINVAL;
+	pages = READ_ONCE(memtop_pages[level - 1]);
+	if (!pages) {
+		rc = diag310_get_memtop_size(&pages, level);
+		if (rc)
+			return rc;
+		WRITE_ONCE(memtop_pages[level - 1], pages);
+	}
+	*res = pages;
+	return 0;
+}
+
+long diag310_memtop_stride(unsigned long arg)
+{
+	size_t __user *argp = (void __user *)arg;
+	unsigned long stride;
+	int rc;
+
+	rc = diag310_check_features();
+	if (rc)
+		return rc;
+	rc = memtop_get_stride_len(&stride);
+	if (rc)
+		return rc;
+	if (put_user(stride, argp))
+		return -EFAULT;
+	return 0;
+}
+
+long diag310_memtop_len(unsigned long arg)
+{
+	size_t __user *argp = (void __user *)arg;
+	unsigned long pages, level;
+	int rc;
+
+	rc = diag310_check_features();
+	if (rc)
+		return rc;
+	if (get_user(level, argp))
+		return -EFAULT;
+	rc = memtop_get_page_count(&pages, level);
+	if (rc)
+		return rc;
+	if (put_user(pages * PAGE_SIZE, argp))
+		return -EFAULT;
+	return 0;
+}
+
+long diag310_memtop_buf(unsigned long arg)
+{
+	struct diag310_memtop __user *udata = (struct diag310_memtop __user *)arg;
+	unsigned long level, pages, data_size;
+	u64 address;
+	void *buf;
+	int rc;
+
+	rc = diag310_check_features();
+	if (rc)
+		return rc;
+	if (get_user(level, &udata->nesting_lvl))
+		return -EFAULT;
+	if (get_user(address, &udata->address))
+		return -EFAULT;
+	rc = memtop_get_page_count(&pages, level);
+	if (rc)
+		return rc;
+	data_size = pages * PAGE_SIZE;
+	buf = __vmalloc_node(data_size, PAGE_SIZE, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT,
+			     NUMA_NO_NODE, __builtin_return_address(0));
+	if (!buf)
+		return -ENOMEM;
+	rc = diag310_store_topology_map(buf, pages, level);
+	if (rc)
+		goto out;
+	if (copy_to_user((void __user *)address, buf, data_size))
+		rc = -EFAULT;
+out:
+	vfree(buf);
+	return rc;
+}
diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c
new file mode 100644
index 000000000000..7fa4c0b7eb6c
--- /dev/null
+++ b/arch/s390/kernel/diag/diag324.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Request power readings for resources in a computing environment via
+ * diag 0x324. diag 0x324 stores the power readings in the power information
+ * block (pib).
+ *
+ * Copyright IBM Corp. 2024
+ */
+
+#define pr_fmt(fmt) "diag324: " fmt
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/ioctl.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+
+#include <asm/diag.h>
+#include <asm/sclp.h>
+#include <asm/timex.h>
+#include <uapi/asm/diag.h>
+#include "diag_ioctl.h"
+
+enum subcode {
+	DIAG324_SUBC_0 = 0,
+	DIAG324_SUBC_1 = 1,
+	DIAG324_SUBC_2 = 2,
+};
+
+enum retcode {
+	DIAG324_RET_SUCCESS		= 0x0001,
+	DIAG324_RET_SUBC_NOTAVAIL	= 0x0103,
+	DIAG324_RET_INSUFFICIENT_SIZE	= 0x0104,
+	DIAG324_RET_READING_UNAVAILABLE	= 0x0105,
+};
+
+union diag324_response {
+	u64 response;
+	struct {
+		u64 installed	: 32;
+		u64		: 16;
+		u64 rc		: 16;
+	} sc0;
+	struct {
+		u64 format	: 16;
+		u64		: 16;
+		u64 pib_len	: 16;
+		u64 rc		: 16;
+	} sc1;
+	struct {
+		u64		: 48;
+		u64 rc		: 16;
+	} sc2;
+};
+
+union diag324_request {
+	u64 request;
+	struct {
+		u64		: 32;
+		u64 allocated	: 16;
+		u64		: 12;
+		u64 sc		: 4;
+	} sc2;
+};
+
+struct pib {
+	u32		: 8;
+	u32 num		: 8;
+	u32 len		: 16;
+	u32		: 24;
+	u32 hlen	: 8;
+	u64		: 64;
+	u64 intv;
+	u8  r[];
+} __packed;
+
+struct pibdata {
+	struct pib *pib;
+	ktime_t expire;
+	u64 sequence;
+	size_t len;
+	int rc;
+};
+
+static DEFINE_MUTEX(pibmutex);
+static struct pibdata pibdata;
+
+#define PIBWORK_DELAY (5 * NSEC_PER_SEC)
+
+static void pibwork_handler(struct work_struct *work);
+static DECLARE_DELAYED_WORK(pibwork, pibwork_handler);
+
+static unsigned long diag324(unsigned long subcode, void *addr)
+{
+	union register_pair rp = { .even = (unsigned long)addr };
+
+	diag_stat_inc(DIAG_STAT_X324);
+	asm volatile("diag %[rp],%[subcode],0x324\n"
+		     : [rp] "+d" (rp.pair)
+		     : [subcode] "d" (subcode)
+		     : "memory");
+	return rp.odd;
+}
+
+static void pibwork_handler(struct work_struct *work)
+{
+	struct pibdata *data = &pibdata;
+	ktime_t timedout;
+
+	mutex_lock(&pibmutex);
+	timedout = ktime_add_ns(data->expire, PIBWORK_DELAY);
+	if (ktime_before(ktime_get(), timedout)) {
+		mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+		goto out;
+	}
+	vfree(data->pib);
+	data->pib = NULL;
+out:
+	mutex_unlock(&pibmutex);
+}
+
+static void pib_update(struct pibdata *data)
+{
+	union diag324_request req = { .sc2.sc = DIAG324_SUBC_2, .sc2.allocated = data->len };
+	union diag324_response res;
+	int rc;
+
+	memset(data->pib, 0, data->len);
+	res.response = diag324(req.request, data->pib);
+	switch (res.sc2.rc) {
+	case DIAG324_RET_SUCCESS:
+		rc = 0;
+		break;
+	case DIAG324_RET_SUBC_NOTAVAIL:
+		rc = -ENOENT;
+		break;
+	case DIAG324_RET_INSUFFICIENT_SIZE:
+		rc = -EMSGSIZE;
+		break;
+	case DIAG324_RET_READING_UNAVAILABLE:
+		rc = -EBUSY;
+		break;
+	default:
+		rc = -EINVAL;
+	}
+	data->rc = rc;
+}
+
+long diag324_pibbuf(unsigned long arg)
+{
+	struct diag324_pib __user *udata = (struct diag324_pib __user *)arg;
+	struct pibdata *data = &pibdata;
+	static bool first = true;
+	u64 address;
+	int rc;
+
+	if (!data->len)
+		return -EOPNOTSUPP;
+	if (get_user(address, &udata->address))
+		return -EFAULT;
+	mutex_lock(&pibmutex);
+	rc = -ENOMEM;
+	if (!data->pib)
+		data->pib = vmalloc(data->len);
+	if (!data->pib)
+		goto out;
+	if (first || ktime_after(ktime_get(), data->expire)) {
+		pib_update(data);
+		data->sequence++;
+		data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv));
+		mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY));
+		first = false;
+	}
+	rc = data->rc;
+	if (rc != 0 && rc != -EBUSY)
+		goto out;
+	rc = copy_to_user((void __user *)address, data->pib, data->pib->len);
+	rc |= put_user(data->sequence, &udata->sequence);
+	if (rc)
+		rc = -EFAULT;
+out:
+	mutex_unlock(&pibmutex);
+	return rc;
+}
+
+long diag324_piblen(unsigned long arg)
+{
+	struct pibdata *data = &pibdata;
+
+	if (!data->len)
+		return -EOPNOTSUPP;
+	if (put_user(data->len, (size_t __user *)arg))
+		return -EFAULT;
+	return 0;
+}
+
+static int __init diag324_init(void)
+{
+	union diag324_response res;
+	unsigned long installed;
+
+	if (!sclp.has_diag324)
+		return -EOPNOTSUPP;
+	res.response = diag324(DIAG324_SUBC_0, NULL);
+	if (res.sc0.rc != DIAG324_RET_SUCCESS)
+		return -EOPNOTSUPP;
+	installed = res.response;
+	if (!test_bit_inv(DIAG324_SUBC_1, &installed))
+		return -EOPNOTSUPP;
+	if (!test_bit_inv(DIAG324_SUBC_2, &installed))
+		return -EOPNOTSUPP;
+	res.response = diag324(DIAG324_SUBC_1, NULL);
+	if (res.sc1.rc != DIAG324_RET_SUCCESS)
+		return -EOPNOTSUPP;
+	pibdata.len = res.sc1.pib_len;
+	return 0;
+}
+device_initcall(diag324_init);
diff --git a/arch/s390/kernel/diag/diag_ioctl.h b/arch/s390/kernel/diag/diag_ioctl.h
new file mode 100644
index 000000000000..7080be946785
--- /dev/null
+++ b/arch/s390/kernel/diag/diag_ioctl.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _DIAG_IOCTL_H
+#define _DIAG_IOCTL_H
+
+#include <linux/types.h>
+
+long diag324_pibbuf(unsigned long arg);
+long diag324_piblen(unsigned long arg);
+
+long diag310_memtop_stride(unsigned long arg);
+long diag310_memtop_len(unsigned long arg);
+long diag310_memtop_buf(unsigned long arg);
+
+#endif /* _DIAG_IOCTL_H */
diff --git a/arch/s390/kernel/diag/diag_misc.c b/arch/s390/kernel/diag/diag_misc.c
new file mode 100644
index 000000000000..efffe02ea02e
--- /dev/null
+++ b/arch/s390/kernel/diag/diag_misc.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide diagnose information via misc device /dev/diag.
+ *
+ * Copyright IBM Corp. 2024
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/ioctl.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/types.h>
+
+#include <uapi/asm/diag.h>
+#include "diag_ioctl.h"
+
+static long diag_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	long rc;
+
+	switch (cmd) {
+	case DIAG324_GET_PIBLEN:
+		rc = diag324_piblen(arg);
+		break;
+	case DIAG324_GET_PIBBUF:
+		rc = diag324_pibbuf(arg);
+		break;
+	case DIAG310_GET_STRIDE:
+		rc = diag310_memtop_stride(arg);
+		break;
+	case DIAG310_GET_MEMTOPLEN:
+		rc = diag310_memtop_len(arg);
+		break;
+	case DIAG310_GET_MEMTOPBUF:
+		rc = diag310_memtop_buf(arg);
+		break;
+	default:
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+	return rc;
+}
+
+static const struct file_operations fops = {
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= diag_ioctl,
+};
+
+static struct miscdevice diagdev = {
+	.name	= "diag",
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &fops,
+	.mode	= 0444,
+};
+
+static int diag_init(void)
+{
+	return misc_register(&diagdev);
+}
+
+device_initcall(diag_init);
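Review note: diag_misc.c multiplexes the diag 0x310 and diag 0x324 data through a single misc device, /dev/diag. A hypothetical user-space consumer of the power-information ioctls; the ioctl numbers and the diag324_pib layout are taken on trust from the uapi <asm/diag.h>, and only the address and sequence fields actually referenced in the kernel code above are assumed:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/diag.h>	/* DIAG324_GET_PIBLEN, DIAG324_GET_PIBBUF, struct diag324_pib */

int main(void)
{
	struct diag324_pib req;
	uint64_t len;
	void *buf;
	int fd;

	fd = open("/dev/diag", O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, DIAG324_GET_PIBLEN, &len))	/* size of the pib */
		return 1;
	buf = malloc(len);
	if (!buf)
		return 1;
	req.address = (uint64_t)(uintptr_t)buf;		/* kernel copies pib here */
	if (ioctl(fd, DIAG324_GET_PIBBUF, &req) == 0)
		printf("pib sequence %llu\n", (unsigned long long)req.sequence);
	free(buf);
	close(fd);
	return 0;
}

The memory-topology path is analogous: DIAG310_GET_STRIDE and DIAG310_GET_MEMTOPLEN size the buffer, DIAG310_GET_MEMTOPBUF fills it.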
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 89dc826a8d2e..94eb8168ea44 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -122,6 +122,7 @@
 enum {
	U8_32,		/* 8 bit unsigned value starting at 32 */
	U12_16,		/* 12 bit unsigned value starting at 16 */
	U16_16,		/* 16 bit unsigned value starting at 16 */
+	U16_20,		/* 16 bit unsigned value starting at 20 */
	U16_32,		/* 16 bit unsigned value starting at 32 */
	U32_16,		/* 32 bit unsigned value starting at 16 */
	VX_12,		/* Vector index register starting at position 12 */
@@ -184,6 +185,7 @@
 static const struct s390_operand operands[] = {
	[U8_32]  = {  8, 32, 0 },
	[U12_16] = { 12, 16, 0 },
	[U16_16] = { 16, 16, 0 },
+	[U16_20] = { 16, 20, 0 },
	[U16_32] = { 16, 32, 0 },
	[U32_16] = { 32, 16, 0 },
	[VX_12]  = {  4, 12, OPERAND_INDEX | OPERAND_VR },
@@ -257,7 +259,6 @@
 static const unsigned char formats[][6] = {
	[INSTR_RSL_R0RD]     = { D_20, L4_8, B_16, 0, 0, 0 },
	[INSTR_RSY_AARD]     = { A_8, A_12, D20_20, B_16, 0, 0 },
	[INSTR_RSY_CCRD]     = { C_8, C_12, D20_20, B_16, 0, 0 },
-	[INSTR_RSY_RDRU]     = { R_8, D20_20, B_16, U4_12, 0, 0 },
	[INSTR_RSY_RRRD]     = { R_8, R_12, D20_20, B_16, 0, 0 },
	[INSTR_RSY_RURD]     = { R_8, U4_12, D20_20, B_16, 0, 0 },
	[INSTR_RSY_RURD2]    = { R_8, D20_20, B_16, U4_12, 0, 0 },
@@ -300,14 +301,17 @@
	[INSTR_VRI_V0UU2]    = { V_8, U16_16, U4_32, 0, 0, 0 },
	[INSTR_VRI_V0UUU]    = { V_8, U8_16, U8_24, U4_32, 0, 0 },
	[INSTR_VRI_VR0UU]    = { V_8, R_12, U8_28, U4_24, 0, 0 },
+	[INSTR_VRI_VV0UU]    = { V_8, V_12, U8_28, U4_24, 0, 0 },
	[INSTR_VRI_VVUU]     = { V_8, V_12, U16_16, U4_32, 0, 0 },
	[INSTR_VRI_VVUUU]    = { V_8, V_12, U12_16, U4_32, U4_28, 0 },
	[INSTR_VRI_VVUUU2]   = { V_8, V_12, U8_28, U8_16, U4_24, 0 },
	[INSTR_VRI_VVV0U]    = { V_8, V_12, V_16, U8_24, 0, 0 },
	[INSTR_VRI_VVV0UU]   = { V_8, V_12, V_16, U8_24, U4_32, 0 },
	[INSTR_VRI_VVV0UU2]  = { V_8, V_12, V_16, U8_28, U4_24, 0 },
-	[INSTR_VRR_0V]	     = { V_12, 0, 0, 0, 0, 0 },
+	[INSTR_VRI_VVV0UV]   = { V_8, V_12, V_16, V_32, U8_24, 0 },
+	[INSTR_VRR_0V0U]     = { V_12, U16_20, 0, 0, 0, 0 },
	[INSTR_VRR_0VV0U]    = { V_12, V_16, U4_24, 0, 0, 0 },
+	[INSTR_VRR_0VVU]     = { V_12, V_16, U16_20, 0, 0, 0 },
	[INSTR_VRR_RV0UU]    = { R_8, V_12, U4_24, U4_28, 0, 0 },
	[INSTR_VRR_VRR]	     = { V_8, R_12, R_16, 0, 0, 0 },
	[INSTR_VRR_VV]	     = { V_8, V_12, 0, 0, 0, 0 },
@@ -455,21 +459,21 @@
 static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
		if (separator)
			ptr += sprintf(ptr, "%c", separator);
"%c", separator); if (operand->flags & OPERAND_GPR) - ptr += sprintf(ptr, "%%r%i", value); + ptr += sprintf(ptr, "%%r%u", value); else if (operand->flags & OPERAND_FPR) - ptr += sprintf(ptr, "%%f%i", value); + ptr += sprintf(ptr, "%%f%u", value); else if (operand->flags & OPERAND_AR) - ptr += sprintf(ptr, "%%a%i", value); + ptr += sprintf(ptr, "%%a%u", value); else if (operand->flags & OPERAND_CR) - ptr += sprintf(ptr, "%%c%i", value); + ptr += sprintf(ptr, "%%c%u", value); else if (operand->flags & OPERAND_VR) - ptr += sprintf(ptr, "%%v%i", value); + ptr += sprintf(ptr, "%%v%u", value); else if (operand->flags & OPERAND_PCREL) { void *pcrel = (void *)((int)value + addr); ptr += sprintf(ptr, "%px", pcrel); } else if (operand->flags & OPERAND_SIGNED) - ptr += sprintf(ptr, "%i", value); + ptr += sprintf(ptr, "%i", (int)value); else ptr += sprintf(ptr, "%u", value); if (operand->flags & OPERAND_DISP) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index d2012635b093..1ecd0580561f 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -61,28 +61,28 @@ static bool in_task_stack(unsigned long sp, struct task_struct *task, static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.async_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->async_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_IRQ, stack); } static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.nodat_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->nodat_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_NODAT, stack); } static bool in_mcck_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.mcck_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->mcck_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_MCCK, stack); } static bool in_restart_stack(unsigned long sp, struct stack_info *info) { - unsigned long stack = S390_lowcore.restart_stack - STACK_INIT_OFFSET; + unsigned long stack = get_lowcore()->restart_stack - STACK_INIT_OFFSET; return in_stack(sp, info, STACK_TYPE_RESTART, stack); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c666271433fb..2fa25164df7d 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -7,6 +7,7 @@ #define KMSG_COMPONENT "setup" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/sched/debug.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -48,6 +49,8 @@ decompressor_handled_param(dfltcc); decompressor_handled_param(facilities); decompressor_handled_param(nokaslr); decompressor_handled_param(cmma); +decompressor_handled_param(relocate_lowcore); +decompressor_handled_param(bootdebug); #if IS_ENABLED(CONFIG_KVM) decompressor_handled_param(prot_virt); #endif @@ -56,7 +59,7 @@ static void __init kasan_early_init(void) { #ifdef CONFIG_KASAN init_task.kasan_depth = 0; - sclp_early_printk("KernelAddressSanitizer initialized\n"); + pr_info("KernelAddressSanitizer initialized\n"); #endif } @@ -72,7 +75,7 @@ static void __init reset_tod_clock(void) memset(&tod_clock_base, 0, sizeof(tod_clock_base)); tod_clock_base.tod = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; + get_lowcore()->last_update_clock = TOD_UNIX_EPOCH; } /* @@ -99,7 +102,7 @@ static noinline __init void detect_machine_type(void) /* Check current-configuration-level */ if 
(stsi(NULL, 0, 0, 0) <= 2) { - S390_lowcore.machine_flags |= MACHINE_FLAG_LPAR; + get_lowcore()->machine_flags |= MACHINE_FLAG_LPAR; return; } /* Get virtual-machine cpu information. */ @@ -108,9 +111,9 @@ static noinline __init void detect_machine_type(void) /* Detect known hypervisors */ if (!memcmp(vmms->vm[0].cpi, "\xd2\xe5\xd4", 3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_KVM; + get_lowcore()->machine_flags |= MACHINE_FLAG_KVM; else if (!memcmp(vmms->vm[0].cpi, "\xa9\x61\xe5\xd4", 4)) - S390_lowcore.machine_flags |= MACHINE_FLAG_VM; + get_lowcore()->machine_flags |= MACHINE_FLAG_VM; } /* Remove leading, trailing and double whitespace. */ @@ -166,7 +169,7 @@ static __init void setup_topology(void) if (!test_facility(11)) return; - S390_lowcore.machine_flags |= MACHINE_FLAG_TOPOLOGY; + get_lowcore()->machine_flags |= MACHINE_FLAG_TOPOLOGY; for (max_mnest = 6; max_mnest > 1; max_mnest--) { if (stsi(&sysinfo_page, 15, 1, max_mnest) == 0) break; @@ -174,27 +177,45 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -void __do_early_pgm_check(struct pt_regs *regs) +void __init __do_early_pgm_check(struct pt_regs *regs) { - if (!fixup_exception(regs)) - disabled_wait(); + struct lowcore *lc = get_lowcore(); + unsigned long ip; + + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = lc->trans_exc_code; + ip = __rewind_psw(regs->psw, regs->int_code >> 16); + + /* Monitor Event? Might be a warning */ + if ((regs->int_code & PGM_INT_CODE_MASK) == 0x40) { + if (report_bug(ip, regs) == BUG_TRAP_TYPE_WARN) + return; + } + if (fixup_exception(regs)) + return; + /* + * Unhandled exception - system cannot continue but try to get some + * helpful messages to the console. Use early_printk() to print + * some basic information in case it is too early for printk(). 
+ */ + register_early_console(); + early_printk("PANIC: early exception %04x PSW: %016lx %016lx\n", + regs->int_code & 0xffff, regs->psw.mask, regs->psw.addr); + show_regs(regs); + disabled_wait(); } static noinline __init void setup_lowcore_early(void) { + struct lowcore *lc = get_lowcore(); psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; psw.mask = PSW_KERNEL_BITS; - S390_lowcore.program_new_psw = psw; - S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; -} - -static noinline __init void setup_facility_list(void) -{ - memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list)); - if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) - __clear_facility(82, alt_stfle_fac_list); + lc->program_new_psw = psw; + lc->preempt_count = INIT_PREEMPT_COUNT; + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); } static __init void detect_diag9c(void) @@ -211,43 +232,45 @@ static __init void detect_diag9c(void) EX_TABLE(0b,1b) : "=d" (rc) : "0" (-EOPNOTSUPP), "d" (cpu_address) : "cc"); if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; + get_lowcore()->machine_flags |= MACHINE_FLAG_DIAG9C; } static __init void detect_machine_facilities(void) { if (test_facility(8)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT1; + get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT1; system_ctl_set_bit(0, CR0_EDAT_BIT); } if (test_facility(78)) - S390_lowcore.machine_flags |= MACHINE_FLAG_EDAT2; + get_lowcore()->machine_flags |= MACHINE_FLAG_EDAT2; if (test_facility(3)) - S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; + get_lowcore()->machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(50) && test_facility(73)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + get_lowcore()->machine_flags |= MACHINE_FLAG_TE; system_ctl_set_bit(0, CR0_TRANSACTIONAL_EXECUTION_BIT); } if (test_facility(51)) - S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; + get_lowcore()->machine_flags |= MACHINE_FLAG_TLB_LC; if (test_facility(129)) system_ctl_set_bit(0, CR0_VECTOR_BIT); if (test_facility(130)) - S390_lowcore.machine_flags |= MACHINE_FLAG_NX; + get_lowcore()->machine_flags |= MACHINE_FLAG_NX; if (test_facility(133)) - S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + get_lowcore()->machine_flags |= MACHINE_FLAG_GS; if (test_facility(139) && (tod_clock_base.tod >> 63)) { /* Enabled signed clock comparator comparisons */ - S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + get_lowcore()->machine_flags |= MACHINE_FLAG_SCC; clock_comparator_max = -1ULL >> 1; system_ctl_set_bit(0, CR0_CLOCK_COMPARATOR_SIGN_BIT); } if (IS_ENABLED(CONFIG_PCI) && test_facility(153)) { - S390_lowcore.machine_flags |= MACHINE_FLAG_PCI_MIO; + get_lowcore()->machine_flags |= MACHINE_FLAG_PCI_MIO; /* the control bit is set during PCI initialization */ } if (test_facility(194)) - S390_lowcore.machine_flags |= MACHINE_FLAG_RDP; + get_lowcore()->machine_flags |= MACHINE_FLAG_RDP; + if (test_facility(85)) + get_lowcore()->machine_flags |= MACHINE_FLAG_SEQ_INSN; } static inline void save_vector_registers(void) @@ -291,7 +314,6 @@ void __init startup_init(void) lockdep_off(); sort_amode31_extable(); setup_lowcore_early(); - setup_facility_list(); detect_machine_type(); setup_arch_string(); setup_boot_command_line(); diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index d9d53f44008a..cefe020a3be3 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <linux/console.h> #include <linux/kernel.h> #include 
<linux/init.h> +#include <asm/setup.h> #include <asm/sclp.h> static void sclp_early_write(struct console *con, const char *s, unsigned int len) @@ -20,6 +21,16 @@ static struct console sclp_early_console = { .index = -1, }; +void __init register_early_console(void) +{ + if (early_console) + return; + if (!sclp.has_linemode && !sclp.has_vt220) + return; + early_console = &sclp_early_console; + register_console(early_console); +} + static int __init setup_early_printk(char *buf) { if (early_console) @@ -27,10 +38,7 @@ static int __init setup_early_printk(char *buf) /* Accept only "earlyprintk" and "earlyprintk=sclp" */ if (buf && !str_has_prefix(buf, "sclp")) return 0; - if (!sclp.has_linemode && !sclp.has_vt220) - return 0; - early_console = &sclp_early_console; - register_console(early_console); + register_early_console(); return 0; } early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S deleted file mode 100644 index c634871f0d90..000000000000 --- a/arch/s390/kernel/earlypgm.S +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 2006, 2007 - * Author(s): Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -SYM_CODE_START(early_pgm_check_handler) - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) - la %r11,STACK_FRAME_OVERHEAD(%r15) - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - lgr %r2,%r11 - brasl %r14,__do_early_pgm_check - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - lpswe __LC_RETURN_PSW -SYM_CODE_END(early_pgm_check_handler) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 6a1e0fbbaa15..4cc3408c4dac 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/linkage.h> #include <asm/asm-extable.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/processor.h> #include <asm/cache.h> #include <asm/dwarf.h> @@ -28,53 +28,46 @@ #include <asm/setup.h> #include <asm/nmi.h> #include <asm/nospec-insn.h> +#include <asm/lowcore.h> _LPP_OFFSET = __LC_LPP .macro STBEAR address - ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2010000,\address", ALT_FACILITY(193) .endm .macro LBEAR address - ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193 + ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193) .endm - .macro LPSWEY address,lpswe - ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193 + .macro LPSWEY address, lpswe + ALTERNATIVE_2 "b \lpswe;nopr", \ + ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193), \ + __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ + ALT_LOWCORE .endm - .macro MBEAR reg - ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193 + .macro MBEAR reg, lowcore + ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK(\lowcore)),\ + ALT_FACILITY(193) .endm - .macro CHECK_STACK savearea -#ifdef CONFIG_CHECK_STACK - tml %r15,THREAD_SIZE - CONFIG_STACK_GUARD - lghi %r14,\savearea - jz stack_overflow -#endif - .endm - - .macro CHECK_VMAP_STACK savearea,oklabel -#ifdef CONFIG_VMAP_STACK + .macro CHECK_VMAP_STACK savearea, lowcore, oklabel lgr 
%r14,%r15 nill %r14,0x10000 - THREAD_SIZE oill %r14,STACK_INIT_OFFSET - clg %r14,__LC_KERNEL_STACK + clg %r14,__LC_KERNEL_STACK(\lowcore) je \oklabel - clg %r14,__LC_ASYNC_STACK + clg %r14,__LC_ASYNC_STACK(\lowcore) je \oklabel - clg %r14,__LC_MCCK_STACK + clg %r14,__LC_MCCK_STACK(\lowcore) je \oklabel - clg %r14,__LC_NODAT_STACK + clg %r14,__LC_NODAT_STACK(\lowcore) je \oklabel - clg %r14,__LC_RESTART_STACK + clg %r14,__LC_RESTART_STACK(\lowcore) je \oklabel - lghi %r14,\savearea + la %r14,\savearea(\lowcore) j stack_overflow -#else - j \oklabel -#endif .endm /* @@ -100,30 +93,31 @@ _LPP_OFFSET = __LC_LPP .endm .macro BPOFF - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_SPEC(82) .endm .macro BPON - ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82 + ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm .macro BPENTER tif_ptr,tif_mask ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \ - "j .+12; nop; nop", 82 + "j .+12; nop; nop", ALT_SPEC(82) .endm .macro BPEXIT tif_ptr,tif_mask TSTMSK \tif_ptr,\tif_mask ALTERNATIVE "jz .+8; .insn rrf,0xb2e80000,0,0,12,0", \ - "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82 + "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82) .endm #if IS_ENABLED(CONFIG_KVM) - .macro SIEEXIT sie_control - lg %r9,\sie_control # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - ni __LC_CPU_FLAGS+7,255-_CIF_SIE + .macro SIEEXIT sie_control,lowcore + lg %r9,\sie_control # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE(\lowcore) # load primary asce + lg %r9,__LC_CURRENT(\lowcore) + mvi __TI_sie(%r9),0 larl %r9,sie_exit # skip forward to sie_exit .endm #endif @@ -163,13 +157,14 @@ SYM_FUNC_START(__switch_to_asm) stg %r15,__THREAD_ksp(%r1,%r2) # store kernel stack of prev lg %r15,0(%r4,%r3) # start of kernel stack of next agr %r15,%r5 # end of kernel stack of next - stg %r3,__LC_CURRENT # store task struct of next - stg %r15,__LC_KERNEL_STACK # store end of kernel stack + GET_LC %r13 + stg %r3,__LC_CURRENT(%r13) # store task struct of next + stg %r15,__LC_KERNEL_STACK(%r13) # store end of kernel stack lg %r15,__THREAD_ksp(%r1,%r3) # load kernel stack of next aghi %r3,__TASK_pid - mvc __LC_CURRENT_PID(4,%r0),0(%r3) # store pid of next + mvc __LC_CURRENT_PID(4,%r13),0(%r3) # store pid of next + ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 BR_EX %r14 SYM_FUNC_END(__switch_to_asm) @@ -179,22 +174,21 @@ SYM_FUNC_END(__switch_to_asm) * %r2 pointer to sie control block phys * %r3 pointer to sie control block virt * %r4 guest register save area + * %r5 guest asce */ SYM_FUNC_START(__sie64a) stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - lg %r12,__LC_CURRENT + GET_LC %r13 + lg %r14,__LC_CURRENT(%r13) stg %r2,__SF_SIE_CONTROL_PHYS(%r15) # save sie block physical.. 
stg %r3,__SF_SIE_CONTROL(%r15) # ...and virtual addresses stg %r4,__SF_SIE_SAVEAREA(%r15) # save guest register save area + stg %r5,__SF_SIE_GUEST_ASCE(%r15) # save guest asce xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 - mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags + mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags lmg %r0,%r13,0(%r4) # load guest gprs 0-13 - lg %r14,__LC_GMAP # get gmap pointer - ltgr %r14,%r14 - jz .Lsie_gmap - oi __LC_CPU_FLAGS+7,_CIF_SIE - lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce -.Lsie_gmap: + mvi __TI_sie(%r14),1 + lctlg %c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now tm __SIE_PROG20+3(%r14),3 # last exit... @@ -212,19 +206,10 @@ SYM_FUNC_START(__sie64a) .Lsie_skip: lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - ni __LC_CPU_FLAGS+7,255-_CIF_SIE -# some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There -# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. -# Other instructions between __sie64a and .Lsie_done should not cause program -# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -.Lrewind_pad6: - nopr 7 -.Lrewind_pad4: - nopr 7 -.Lrewind_pad2: - nopr 7 + GET_LC %r14 + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r14) # load primary asce + lg %r14,__LC_CURRENT(%r14) + mvi __TI_sie(%r14),0 SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area stmg %r0,%r13,0(%r14) # save guest gprs 0-13 @@ -236,15 +221,6 @@ SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL) lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers lg %r2,__SF_SIE_REASON(%r15) # return exit reason code BR_EX %r14 -.Lsie_fault: - lghi %r14,-EFAULT - stg %r14,__SF_SIE_REASON(%r15) # set exit reason code - j sie_exit - - EX_TABLE(.Lrewind_pad6,.Lsie_fault) - EX_TABLE(.Lrewind_pad4,.Lsie_fault) - EX_TABLE(.Lrewind_pad2,.Lsie_fault) - EX_TABLE(sie_exit,.Lsie_fault) SYM_FUNC_END(__sie64a) EXPORT_SYMBOL(__sie64a) EXPORT_SYMBOL(sie_exit) @@ -256,14 +232,15 @@ EXPORT_SYMBOL(sie_exit) */ SYM_CODE_START(system_call) - stpt __LC_SYS_ENTER_TIMER - stmg %r8,%r15,__LC_SAVE_AREA_SYNC + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF lghi %r14,0 .Lsysc_per: - STBEAR __LC_LAST_BREAK - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK + STBEAR __LC_LAST_BREAK(%r13) + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) # clear user controlled register to prevent speculative use @@ -278,17 +255,17 @@ SYM_CODE_START(system_call) xgr %r10,%r10 xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC - MBEAR %r2 + mvc __PT_R8(64,%r2),__LC_SAVE_AREA(%r13) + MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lctlg %c1,%c1,__LC_USER_ASCE(%r13) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg 
%r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE SYM_CODE_END(system_call) @@ -299,12 +276,13 @@ SYM_CODE_START(ret_from_fork) lgr %r3,%r11 brasl %r14,__ret_from_fork STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + GET_LC %r13 + lctlg %c1,%c1,__LC_USER_ASCE(%r13) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) BPON LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) + stpt __LC_EXIT_TIMER(%r13) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - stpt __LC_EXIT_TIMER LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE SYM_CODE_END(ret_from_fork) @@ -313,41 +291,40 @@ SYM_CODE_END(ret_from_fork) */ SYM_CODE_START(pgm_check_handler) - stpt __LC_SYS_ENTER_TIMER + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lgr %r10,%r15 - lmg %r8,%r9,__LC_PGM_OLD_PSW + lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13) + xgr %r10,%r10 tmhh %r8,0x0001 # coming from user space? jno .Lpgm_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) j 3f # -> fault in user space .Lpgm_skip_asce: +#if IS_ENABLED(CONFIG_KVM) + lg %r11,__LC_CURRENT(%r13) + tm __TI_sie(%r11),0xff + jz 1f + BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 + lghi %r10,_PIF_GUEST_FAULT +#endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 2f # -> enabled, can't be a double fault - tm __LC_PGM_ILC+3,0x80 # check for per exception + tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) +2: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f -3: lg %r15,__LC_KERNEL_STACK + CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f +3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + stg %r10,__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK - stctg %c1,%c1,__PT_CR1(%r11) -#if IS_ENABLED(CONFIG_KVM) - ltg %r12,__LC_GMAP - jz 5f - clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11) - jne 5f - BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r10) -#endif -5: stmg %r8,%r9,__PT_PSW(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13) + stmg %r8,%r9,__PT_PSW(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -356,16 +333,17 @@ SYM_CODE_START(pgm_check_handler) xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 + xgr %r12,%r12 lgr %r2,%r11 brasl %r14,__do_pgm_check tmhh %r8,0x0001 # returning to user space? 
jno .Lpgm_exit_kernel STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE + lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON - stpt __LC_EXIT_TIMER + stpt __LC_EXIT_TIMER(%r13) .Lpgm_exit_kernel: - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) LBEAR STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15) lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE @@ -374,11 +352,11 @@ SYM_CODE_START(pgm_check_handler) # single stepped system call # .Lpgm_svcper: - mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW + mvc __LC_RETURN_PSW(8,%r13),__LC_SVC_NEW_PSW(%r13) larl %r14,.Lsysc_per - stg %r14,__LC_RETURN_PSW+8 + stg %r14,__LC_RETURN_PSW+8(%r13) lghi %r14,1 - LBEAR __LC_PGM_LAST_BREAK + LBEAR __LC_PGM_LAST_BREAK(%r13) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per SYM_CODE_END(pgm_check_handler) @@ -387,25 +365,26 @@ SYM_CODE_END(pgm_check_handler) */ .macro INT_HANDLER name,lc_old_psw,handler SYM_CODE_START(\name) - stckf __LC_INT_CLOCK - stpt __LC_SYS_ENTER_TIMER - STBEAR __LC_LAST_BREAK + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + stckf __LC_INT_CLOCK(%r13) + stpt __LC_SYS_ENTER_TIMER(%r13) + STBEAR __LC_LAST_BREAK(%r13) BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lmg %r8,%r9,\lc_old_psw + lmg %r8,%r9,\lc_old_psw(%r13) tmhh %r8,0x0001 # interrupting from user ? jnz 1f #if IS_ENABLED(CONFIG_KVM) - TSTMSK __LC_CPU_FLAGS,_CIF_SIE + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff jz 0f BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15) + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) +0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f -1: lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK +1: lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) + lg %r15,__LC_KERNEL_STACK(%r13) 2: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -419,84 +398,67 @@ SYM_CODE_START(\name) xgr %r7,%r7 xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - MBEAR %r11 + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + MBEAR %r11,%r13 stmg %r8,%r9,__PT_PSW(%r11) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,\handler - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + mvc __LC_RETURN_PSW(16,%r13),__PT_PSW(%r11) tmhh %r8,0x0001 # returning to user ? jno 2f STACKLEAK_ERASE - lctlg %c1,%c1,__LC_USER_ASCE + lctlg %c1,%c1,__LC_USER_ASCE(%r13) BPON - stpt __LC_EXIT_TIMER + stpt __LC_EXIT_TIMER(%r13) 2: LBEAR __PT_LAST_BREAK(%r11) lmg %r0,%r15,__PT_R0(%r11) LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE SYM_CODE_END(\name) .endm + .section .irqentry.text, "ax" + INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq -/* - * Load idle PSW. 
- */ -SYM_FUNC_START(psw_idle) - stg %r14,(__SF_GPRS+8*8)(%r15) - stg %r3,__SF_EMPTY(%r15) - larl %r1,psw_idle_exit - stg %r1,__SF_EMPTY+8(%r15) - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) -.Lpsw_idle_stcctm: - oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT - BPON - stckf __CLOCK_IDLE_ENTER(%r2) - stpt __TIMER_IDLE_ENTER(%r2) - lpswe __SF_EMPTY(%r15) -SYM_INNER_LABEL(psw_idle_exit, SYM_L_GLOBAL) - BR_EX %r14 -SYM_FUNC_END(psw_idle) + .section .kprobes.text, "ax" /* * Machine check handler routines */ SYM_CODE_START(mcck_int_handler) BPOFF - lmg %r8,%r9,__LC_MCK_OLD_PSW - TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE + GET_LC %r13 + lmg %r8,%r9,__LC_MCK_OLD_PSW(%r13) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_SYSTEM_DAMAGE jo .Lmcck_panic # yes -> rest of mcck code invalid - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CR_VALID jno .Lmcck_panic # control registers invalid -> panic ptlb - lghi %r14,__LC_CPU_TIMER_SAVE_AREA - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID + lay %r14,__LC_CPU_TIMER_SAVE_AREA(%r13) + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYS_ENTER_TIMER - clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER(%r13) + clc 0(8,%r14),__LC_EXIT_TIMER(%r13) jl 1f - la %r14,__LC_EXIT_TIMER -1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER + la %r14,__LC_EXIT_TIMER(%r13) +1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER(%r13) jl 2f - la %r14,__LC_LAST_UPDATE_TIMER + la %r14,__LC_LAST_UPDATE_TIMER(%r13) 2: spt 0(%r14) - mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) -3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID + mvc __LC_MCCK_ENTER_TIMER(8,%r13),0(%r14) +3: TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? jnz .Lmcck_user - TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID + TSTMSK __LC_MCCK_CODE(%r13),MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic #if IS_ENABLED(CONFIG_KVM) - TSTMSK __LC_CPU_FLAGS,_CIF_SIE + lg %r10,__LC_CURRENT(%r13) + tm __TI_sie(%r10),0xff jz .Lmcck_user - # Need to compare the address instead of a CIF_SIE* flag. + # Need to compare the address instead of __TI_SIE flag. # Otherwise there would be a race between setting the flag # and entering SIE (or leaving and clearing the flag). 
This # would cause machine checks targeted at the guest to be @@ -505,18 +467,19 @@ SYM_CODE_START(mcck_int_handler) clgrjl %r9,%r14, 4f larl %r14,.Lsie_leave clgrjhe %r9,%r14, 4f - oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST + lg %r10,__LC_PCPU + oi __PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST 4: BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST - SIEEXIT __SF_SIE_CONTROL(%r15) + SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif .Lmcck_user: - lg %r15,__LC_MCCK_STACK + lg %r15,__LC_MCCK_STACK(%r13) la %r11,STACK_FRAME_OVERHEAD(%r15) stctg %c1,%c1,__PT_CR1(%r11) - lctlg %c1,%c1,__LC_KERNEL_ASCE + lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - lghi %r14,__LC_GPREGS_SAVE_AREA+64 - stmg %r0,%r7,__PT_R0(%r11) + lay %r14,__LC_GPREGS_SAVE_AREA(%r13) + mvc __PT_R0(128,%r11),0(%r14) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 @@ -526,7 +489,6 @@ SYM_CODE_START(mcck_int_handler) xgr %r6,%r6 xgr %r7,%r7 xgr %r10,%r10 - mvc __PT_R8(64,%r11),0(%r14) stmg %r8,%r9,__PT_PSW(%r11) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) @@ -534,12 +496,13 @@ SYM_CODE_START(mcck_int_handler) brasl %r14,s390_do_machine_check lctlg %c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) - mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW - tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? + mvc __LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW + tm __LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ? jno 0f BPON - stpt __LC_EXIT_TIMER -0: ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193 + stpt __LC_EXIT_TIMER(%r13) +0: ALTERNATIVE "brcl 0,0", __stringify(lay %r12,__LC_LAST_BREAK_SAVE_AREA(%r13)),\ + ALT_FACILITY(193) LBEAR 0(%r12) lmg %r11,%r15,__PT_R11(%r11) LPSWEY __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE @@ -575,7 +538,7 @@ SYM_CODE_START(mcck_int_handler) SYM_CODE_END(mcck_int_handler) SYM_CODE_START(restart_int_handler) - ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40 + ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40) stg %r15,__LC_SAVE_AREA_RESTART TSTMSK __LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4 jz 0f @@ -583,15 +546,17 @@ SYM_CODE_START(restart_int_handler) 0: larl %r15,daton_psw lpswe 0(%r15) # turn dat on, keep irqs off .Ldaton: - lg %r15,__LC_RESTART_STACK + GET_LC %r15 + lg %r15,__LC_RESTART_STACK(%r15) xc STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART - mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW + GET_LC %r13 + mvc STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART(%r13) + mvc STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW(%r13) xc 0(STACK_FRAME_OVERHEAD,%r15),0(%r15) - lg %r1,__LC_RESTART_FN # load fn, parm & source cpu - lg %r2,__LC_RESTART_DATA - lgf %r3,__LC_RESTART_SOURCE + lg %r1,__LC_RESTART_FN(%r13) # load fn, parm & source cpu + lg %r2,__LC_RESTART_DATA(%r13) + lgf %r3,__LC_RESTART_SOURCE(%r13) ltgr %r3,%r3 # test source cpu address jm 1f # negative -> skip source stop 0: sigp %r4,%r3,SIGP_SENSE # sigp sense to source cpu @@ -604,16 +569,34 @@ SYM_CODE_START(restart_int_handler) 3: j 3b SYM_CODE_END(restart_int_handler) + __INIT +SYM_CODE_START(early_pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc 
__PT_PSW(16,%r11),__LC_PGM_OLD_PSW(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(early_pgm_check_handler) + __FINIT + .section .kprobes.text, "ax" -#if defined(CONFIG_CHECK_STACK) || defined(CONFIG_VMAP_STACK) /* * The synchronous or the asynchronous stack overflowed. We are dead. * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ SYM_CODE_START(stack_overflow) - lg %r15,__LC_NODAT_STACK # change to panic stack + GET_LC %r15 + lg %r15,__LC_NODAT_STACK(%r15) # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) @@ -623,7 +606,6 @@ SYM_CODE_START(stack_overflow) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow SYM_CODE_END(stack_overflow) -#endif .section .data, "aw" .balign 4 diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 21969520f947..a1f28879c87e 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -41,7 +41,6 @@ void do_restart(void *arg); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); int setup_profiling_timer(unsigned int multiplier); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index fa90bbdc5ef9..6f2e87920288 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -113,7 +113,7 @@ void load_fpu_state(struct fpu *state, int flags) int mask; if (flags & KERNEL_FPC) - fpu_lfpc(&state->fpc); + fpu_lfpc_safe(&state->fpc); if (!cpu_has_vx()) { if (flags & KERNEL_VXR_V0V7) load_fp_regs_vx(state->vxrs); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index c46381ea04ec..63ba6306632e 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -7,13 +7,14 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/moduleloader.h> #include <linux/hardirq.h> #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/kmsan-checks.h> #include <linux/kprobes.h> +#include <linux/execmem.h> #include <trace/syscall.h> #include <asm/asm-offsets.h> #include <asm/text-patching.h> @@ -49,10 +50,6 @@ struct ftrace_insn { s32 disp; } __packed; -#ifdef CONFIG_MODULES -static char *ftrace_plt; -#endif /* CONFIG_MODULES */ - static const char *ftrace_shared_hotpatch_trampoline(const char **end) { const char *tstart, *tend; @@ -72,19 +69,20 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - return true; + return !MACHINE_HAS_SEQ_INSN; } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = __ftrace_hotpatch_trampolines_start; - static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static const struct ftrace_insn orig = { .opc = 0xc004, .disp = 0 }; static struct ftrace_hotpatch_trampoline *trampoline; struct ftrace_hotpatch_trampoline **next_trampoline; struct ftrace_hotpatch_trampoline *trampolines_end; struct ftrace_hotpatch_trampoline tmp; struct ftrace_insn *insn; + struct ftrace_insn old; const char 
*shared; s32 disp; @@ -98,7 +96,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) if (mod) { next_trampoline = &mod->arch.next_trampoline; trampolines_end = mod->arch.trampolines_end; - shared = ftrace_plt; } #endif @@ -106,8 +103,10 @@ return -ENOMEM; trampoline = (*next_trampoline)++; + if (copy_from_kernel_nofault(&old, (void *)rec->ip, sizeof(old))) + return -EFAULT; /* Check for the compiler-generated fentry nop (brcl 0, .). */ - if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + if (WARN_ON_ONCE(memcmp(&orig, &old, sizeof(old)))) return -EINVAL; /* Generate the trampoline. */ @@ -143,8 +142,35 @@ static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrac return trampoline; } -int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) +static inline struct ftrace_insn +ftrace_generate_branch_insn(unsigned long ip, unsigned long target) +{ + /* brasl r0,target or brcl 0,0 */ + return (struct ftrace_insn){ .opc = target ? 0xc005 : 0xc004, + .disp = target ? (target - ip) / 2 : 0 }; +} + +static int ftrace_patch_branch_insn(unsigned long ip, unsigned long old_target, + unsigned long target) +{ + struct ftrace_insn orig = ftrace_generate_branch_insn(ip, old_target); + struct ftrace_insn new = ftrace_generate_branch_insn(ip, target); + struct ftrace_insn old; + + if (!IS_ALIGNED(ip, 8)) + return -EINVAL; + if (copy_from_kernel_nofault(&old, (void *)ip, sizeof(old))) + return -EFAULT; + /* Verify that the code to be replaced matches what we expect. */ + if (memcmp(&orig, &old, sizeof(old))) + return -EINVAL; + s390_kernel_write((void *)ip, &new, sizeof(new)); + return 0; +} + +static int ftrace_modify_trampoline_call(struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; u64 old; @@ -160,6 +186,15 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, old_addr, addr); + else + return ftrace_modify_trampoline_call(rec, old_addr, addr); +} + static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { u16 old; @@ -178,11 +213,14 @@ static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - /* Expect brcl 0xf,... 
for the !MACHINE_HAS_SEQ_INSN case */ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, addr, 0); + else + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); } -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static int ftrace_make_trampoline_call(struct dyn_ftrace *rec, unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; @@ -194,6 +232,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, 0, addr); + else + return ftrace_make_trampoline_call(rec, addr); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { ftrace_func = func; @@ -214,75 +260,19 @@ void ftrace_arch_code_modify_post_process(void) text_poke_sync_lock(); } -#ifdef CONFIG_MODULES +#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int __init ftrace_plt_init(void) +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { - const char *start, *end; - - ftrace_plt = module_alloc(PAGE_SIZE); - if (!ftrace_plt) - panic("cannot allocate ftrace plt\n"); + unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14]; - start = ftrace_shared_hotpatch_trampoline(&end); - memcpy(ftrace_plt, start, end - start); - set_memory_rox((unsigned long)ftrace_plt, 1); - return 0; -} -device_initcall(ftrace_plt_init); - -#endif /* CONFIG_MODULES */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -/* - * Hook the return address and push it in the stack of return addresses - * in current thread info. - */ -unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, - unsigned long ip) -{ if (unlikely(ftrace_graph_is_dead())) - goto out; + return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) - goto out; - ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(ra, ip, 0, (void *) sp)) - ra = (unsigned long) return_to_handler; -out: - return ra; -} -NOKPROBE_SYMBOL(prepare_ftrace_return); - -/* - * Patch the kernel code at ftrace_graph_caller location. The instruction - * there is branch relative on condition. To enable the ftrace graph code - * block, we simply patch the mask field of the instruction to zero and - * turn the instruction into a nop. - * To disable the ftrace graph code the mask field will be patched to - * all ones, which turns the instruction into an unconditional branch. - */ -int ftrace_enable_ftrace_graph_caller(void) -{ - int rc; - - /* Expect brc 0xf,... */ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - int rc; - - /* Expect brc 0x0,... 
*/ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; + return; + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = (unsigned long)&return_to_handler; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -296,10 +286,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + kmsan_unpoison_memory(fregs, ftrace_regs_size()); regs = ftrace_get_regs(fregs); p = get_kprobe((kprobe_opcode_t *)ip); if (!regs || unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h index 7f75a9616406..23337065f402 100644 --- a/arch/s390/kernel/ftrace.h +++ b/arch/s390/kernel/ftrace.h @@ -18,7 +18,5 @@ extern const char ftrace_shared_hotpatch_trampoline_br[]; extern const char ftrace_shared_hotpatch_trampoline_br_end[]; extern const char ftrace_shared_hotpatch_trampoline_exrl[]; extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; -extern const char ftrace_plt_template[]; -extern const char ftrace_plt_template_end[]; #endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 45413b04efc5..396034b2fe67 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -10,6 +10,7 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <asm/lowcore.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page.h> @@ -18,14 +19,15 @@ __HEAD SYM_CODE_START(startup_continue) larl %r1,tod_clock_base - mvc 0(16,%r1),__LC_BOOT_CLOCK + GET_LC %r2 + mvc 0(16,%r1),__LC_BOOT_CLOCK(%r2) # # Setup stack # larl %r14,init_task - stg %r14,__LC_CURRENT + stg %r14,__LC_CURRENT(%r2) larl %r15,init_thread_union+STACK_INIT_OFFSET - stg %r15,__LC_KERNEL_STACK + stg %r15,__LC_KERNEL_STACK(%r2) brasl %r14,sclp_early_adjust_va # allow sclp_early_printk brasl %r14,startup_init # s390 specific early init brasl %r14,start_kernel # common init code diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c new file mode 100644 index 000000000000..7857a7e8e56c --- /dev/null +++ b/arch/s390/kernel/hiperdispatch.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "hd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +/* + * Hiperdispatch: + * Dynamically calculates the optimum number of high capacity COREs + * by considering the state the system is in. When hiperdispatch decides + * that a capacity update is necessary, it schedules a topology update. + * During topology updates the CPU capacities are always re-adjusted. + * + * There are two places where CPU capacities are being accessed within + * hiperdispatch. + * -> hiperdispatch's recurring work function reads CPU capacities to + * determine high capacity CPU count. + * -> during a topology update hiperdispatch's adjustment function + * updates CPU capacities. + * These two can run on different CPUs in parallel, which can cause + * hiperdispatch to make wrong decisions. This can potentially cause + * some overhead by leading to extra rebuild_sched_domains() calls + * for correction. Access to capacities within hiperdispatch has to be + * serialized to prevent the overhead. + * + * Hiperdispatch decision making revolves around steal time. + * The HD_STEAL_THRESHOLD value is taken as reference. 
Whenever steal time + * crosses the threshold value, hiperdispatch falls back to giving high + * capacities to entitled CPUs. When steal time drops below the + * threshold boundary, hiperdispatch utilizes all CPUs by giving all + * of them high capacity. + * + * The theory behind HD_STEAL_THRESHOLD is related to SMP thread + * performance. Comparing the throughput of: + * - a single CORE, with N threads, running N tasks + * - N separate COREs running N tasks, + * using individual COREs for individual tasks yields better + * performance. This performance difference is roughly ~30% (it can change + * between machine generations). + * + * Hiperdispatch tries to hint the scheduler to use individual COREs for + * each task, as long as steal time on those COREs is less than 30%, + * thereby delaying the throughput loss caused by using SMP threads. + */ + +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/kernel_stat.h> +#include <linux/kstrtox.h> +#include <linux/ktime.h> +#include <linux/sysctl.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <asm/hiperdispatch.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/topology.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hiperdispatch.h> + +#define HD_DELAY_FACTOR (4) +#define HD_DELAY_INTERVAL (HZ / 4) +#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_AVG_WEIGHT 16 + +static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ +static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */ +static int hd_high_capacity_cores; /* Current CORE count with high capacity */ +static int hd_entitled_cores; /* Total vertical high and medium CORE count */ +static int hd_online_cores; /* Current online CORE count */ + +static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */ +static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */ +static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */ +static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */ + +static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD; +static unsigned int hd_delay_factor = HD_DELAY_FACTOR; +static int hd_enabled; + +static void hd_capacity_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); + +static int hd_set_hiperdispatch_mode(int enable) +{ + if (!MACHINE_HAS_TOPOLOGY) + enable = 0; + if (hd_enabled == enable) + return 0; + hd_enabled = enable; + return 1; +} + +void hd_reset_state(void) +{ + cpumask_clear(&hd_vl_coremask); + cpumask_clear(&hd_vmvl_cpumask); + hd_entitled_cores = 0; + hd_online_cores = 0; +} + +void hd_add_core(int cpu) +{ + const struct cpumask *siblings; + int polarization; + + hd_online_cores++; + polarization = smp_cpu_get_polarization(cpu); + siblings = topology_sibling_cpumask(cpu); + switch (polarization) { + case POLARIZATION_VH: + hd_entitled_cores++; + break; + case POLARIZATION_VM: + hd_entitled_cores++; + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + case POLARIZATION_VL: + cpumask_set_cpu(cpu, &hd_vl_coremask); + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + } +} + +/* Serialize update and read operations of debug counters. 
*/ +static DEFINE_MUTEX(hd_counter_mutex); + +static void hd_update_times(void) +{ + static ktime_t prev; + ktime_t now; + + /* + * Check if hiperdispatch is active; if not, set prev to 0. + * This way it is possible to differentiate the first update iteration after + * enabling hiperdispatch. + */ + if (hd_entitled_cores == 0 || hd_enabled == 0) { + prev = ktime_set(0, 0); + return; + } + now = ktime_get(); + if (ktime_after(prev, 0)) { + if (hd_high_capacity_cores == hd_online_cores) + hd_high_time += ktime_ms_delta(now, prev); + else + hd_low_time += ktime_ms_delta(now, prev); + } + prev = now; +} + +static void hd_update_capacities(void) +{ + int cpu, upscaling_cores; + unsigned long capacity; + + upscaling_cores = hd_high_capacity_cores - hd_entitled_cores; + capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW; + hd_high_capacity_cores = hd_entitled_cores; + for_each_cpu(cpu, &hd_vl_coremask) { + smp_set_core_capacity(cpu, capacity); + if (capacity != CPU_CAPACITY_HIGH) + continue; + hd_high_capacity_cores++; + upscaling_cores--; + if (upscaling_cores == 0) + capacity = CPU_CAPACITY_LOW; + } +} + +void hd_disable_hiperdispatch(void) +{ + cancel_delayed_work_sync(&hd_capacity_work); + hd_high_capacity_cores = hd_online_cores; + hd_previous_steal = 0; +} + +int hd_enable_hiperdispatch(void) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + mutex_unlock(&hd_counter_mutex); + if (hd_enabled == 0) + return 0; + if (hd_entitled_cores == 0) + return 0; + if (hd_online_cores <= hd_entitled_cores) + return 0; + mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + hd_update_capacities(); + return 1; +} + +static unsigned long hd_steal_avg(unsigned long new) +{ + static unsigned long steal; + + steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT; + return steal; +} + +static unsigned long hd_calculate_steal_percentage(void) +{ + unsigned long time_delta, steal_delta, steal, percentage; + static ktime_t prev; + int cpus, cpu; + ktime_t now; + + cpus = 0; + steal = 0; + percentage = 0; + for_each_cpu(cpu, &hd_vmvl_cpumask) { + steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + cpus++; + } + /* + * If there are no vertical medium and low CPUs, steal time + * is 0, as vertical high CPUs shouldn't experience steal time. + */ + if (cpus == 0) + return percentage; + now = ktime_get(); + time_delta = ktime_to_ns(ktime_sub(now, prev)); + if (steal > hd_previous_steal && hd_previous_steal != 0) { + steal_delta = (steal - hd_previous_steal) * 100 / time_delta; + percentage = steal_delta / cpus; + } + hd_previous_steal = steal; + prev = now; + return percentage; +} + +static void hd_capacity_work_fn(struct work_struct *work) +{ + unsigned long steal_percentage, new_cores; + + mutex_lock(&smp_cpu_state_mutex); + /* + * If the online cores are less than or equal to the entitled cores, + * hiperdispatch does not need to make any adjustments; call a topology + * update to disable hiperdispatch. + * Normally this check is handled on topology update, but during cpu + * hot-unplug, topology and cpu mask updates are done in reverse + * order, causing hd_enable_hiperdispatch() to get stale data. 
+ */ + if (hd_online_cores <= hd_entitled_cores) { + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return; + } + steal_percentage = hd_steal_avg(hd_calculate_steal_percentage()); + if (steal_percentage < hd_steal_threshold) + new_cores = hd_online_cores; + else + new_cores = hd_entitled_cores; + if (hd_high_capacity_cores != new_cores) { + trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores); + hd_high_capacity_cores = new_cores; + atomic64_inc(&hd_adjustments); + topology_schedule_update(); + } + trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores); + mutex_unlock(&smp_cpu_state_mutex); + schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL); +} + +static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int hiperdispatch; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &hiperdispatch, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + hiperdispatch = hd_enabled; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + mutex_lock(&smp_cpu_state_mutex); + if (hd_set_hiperdispatch_mode(hiperdispatch)) + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return 0; +} + +static const struct ctl_table hiperdispatch_ctl_table[] = { + { + .procname = "hiperdispatch", + .mode = 0644, + .proc_handler = hiperdispatch_ctl_handler, + }, +}; + +static ssize_t hd_steal_threshold_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_steal_threshold); +} + +static ssize_t hd_steal_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (val > 100) + return -ERANGE; + hd_steal_threshold = val; + return count; +} + +static DEVICE_ATTR_RW(hd_steal_threshold); + +static ssize_t hd_delay_factor_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_delay_factor); +} + +static ssize_t hd_delay_factor_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (!val) + return -ERANGE; + hd_delay_factor = val; + return count; +} + +static DEVICE_ATTR_RW(hd_delay_factor); + +static struct attribute *hd_attrs[] = { + &dev_attr_hd_steal_threshold.attr, + &dev_attr_hd_delay_factor.attr, + NULL, +}; + +static const struct attribute_group hd_attr_group = { + .name = "hiperdispatch", + .attrs = hd_attrs, +}; + +static int hd_greedy_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_high_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n"); + +static int hd_conservative_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_low_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n"); + +static int hd_adjustment_count_get(void *unused, u64 *val) +{ + *val = atomic64_read(&hd_adjustments); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n"); + 
+static void __init hd_create_debugfs_counters(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir); + debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops); + debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops); + debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops); +} + +static void __init hd_create_attributes(void) +{ + struct device *dev; + + dev = bus_get_dev_root(&cpu_subsys); + if (!dev) + return; + if (sysfs_create_group(&dev->kobj, &hd_attr_group)) + pr_warn("Unable to create hiperdispatch attribute group\n"); + put_device(dev); +} + +static int __init hd_init(void) +{ + if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) { + hd_set_hiperdispatch_mode(1); + topology_schedule_update(); + } + if (!register_sysctl("s390", hiperdispatch_ctl_table)) + pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n"); + hd_create_debugfs_counters(); + hd_create_attributes(); + return 0; +} +late_initcall(hd_init); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index e7239aaf428b..39cb8d0ae348 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -24,6 +24,7 @@ static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); unsigned long idle_time; u64 cycles_new[8]; int i; @@ -34,13 +35,13 @@ void account_idle_time_irq(void) this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); } - idle_time = S390_lowcore.int_clock - idle->clock_idle_enter; + idle_time = lc->int_clock - idle->clock_idle_enter; - S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; - S390_lowcore.last_update_clock = S390_lowcore.int_clock; + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; - S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; - S390_lowcore.last_update_timer = S390_lowcore.sys_enter_timer; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; /* Account time spent with enabled wait psw loaded as idle time. */ WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); @@ -57,9 +58,13 @@ void noinstr arch_cpu_idle(void) psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); - - /* psw_idle() returns with interrupts disabled. 
*/ - psw_idle(idle, psw_mask); + set_cpu_flag(CIF_ENABLED_WAIT); + if (smp_cpu_mtid) + stcctm(MT_DIAG, smp_cpu_mtid, (u64 *)&idle->mt_cycles_enter); + idle->clock_idle_enter = get_tod_clock_fast(); + idle->timer_idle_enter = get_cpu_timer(); + bpon(); + __load_psw_mask(psw_mask); } static ssize_t show_idle_count(struct device *dev, diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 1486350a4177..69be2309cde0 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -20,6 +20,7 @@ #include <linux/gfp.h> #include <linux/crash_dump.h> #include <linux/debug_locks.h> +#include <linux/vmalloc.h> #include <asm/asm-extable.h> #include <asm/diag.h> #include <asm/ipl.h> @@ -208,7 +209,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ char *page) \ { \ - return scnprintf(page, PAGE_SIZE, _format, ##args); \ + return sysfs_emit(page, _format, ##args); \ } #define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \ @@ -266,7 +267,11 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strscpy(_value, buf, sizeof(_value)); \ + if (len >= sizeof(_value)) \ + return -E2BIG; \ + len = strscpy(_value, buf, sizeof(_value)); \ + if ((ssize_t)len < 0) \ + return len; \ strim(_value); \ return len; \ } \ @@ -275,6 +280,61 @@ static struct kobj_attribute sys_##_prefix##_##_name##_attr = \ sys_##_prefix##_##_name##_show, \ sys_##_prefix##_##_name##_store) +#define IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static ssize_t sys_##_prefix##_scp_data_show(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t size = _ipl_block.scp_data_len; \ + void *scp_data = _ipl_block.scp_data; \ + \ + return memory_read_from_buffer(buf, count, &off, \ + scp_data, size); \ +} + +#define IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static ssize_t sys_##_prefix##_scp_data_store(struct file *filp, \ + struct kobject *kobj, \ + const struct bin_attribute *attr, \ + char *buf, loff_t off, \ + size_t count) \ +{ \ + size_t scpdata_len = count; \ + size_t padding; \ + \ + if (off) \ + return -EINVAL; \ + \ + memcpy(_ipl_block.scp_data, buf, count); \ + if (scpdata_len % 8) { \ + padding = 8 - (scpdata_len % 8); \ + memset(_ipl_block.scp_data + scpdata_len, \ + 0, padding); \ + scpdata_len += padding; \ + } \ + \ + _ipl_block_hdr.len = _ipl_bp_len + scpdata_len; \ + _ipl_block.len = _ipl_bp0_len + scpdata_len; \ + _ipl_block.scp_data_len = scpdata_len; \ + \ + return count; \ +} + +#define DEFINE_IPL_ATTR_SCP_DATA_RO(_prefix, _ipl_block, _size) \ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0444, sys_##_prefix##_scp_data_show, \ + NULL, _size) + +#define DEFINE_IPL_ATTR_SCP_DATA_RW(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len, _size)\ +IPL_ATTR_SCP_DATA_SHOW_FN(_prefix, _ipl_block) \ +IPL_ATTR_SCP_DATA_STORE_FN(_prefix, _ipl_block_hdr, _ipl_block, _ipl_bp_len, _ipl_bp0_len)\ +static const struct bin_attribute sys_##_prefix##_scp_data_attr = \ + __BIN_ATTR(scp_data, 0644, sys_##_prefix##_scp_data_show, \ + sys_##_prefix##_scp_data_store, _size) + /* * ipl section */ @@ -312,7 +372,7 @@ EXPORT_SYMBOL_GPL(ipl_info); static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return 
sprintf(page, "%s\n", ipl_type_str(ipl_info.type)); + return sysfs_emit(page, "%s\n", ipl_type_str(ipl_info.type)); } static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); @@ -320,7 +380,7 @@ static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); static ssize_t ipl_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!ipl_secure_flag); + return sysfs_emit(page, "%i\n", !!ipl_secure_flag); } static struct kobj_attribute sys_ipl_secure_attr = @@ -329,7 +389,7 @@ static struct kobj_attribute sys_ipl_secure_attr = static ssize_t ipl_has_secure_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%i\n", !!sclp.has_sipl); + return sysfs_emit(page, "%i\n", !!sclp.has_sipl); } static struct kobj_attribute sys_ipl_has_secure_attr = @@ -342,7 +402,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); - return sprintf(page, "%s\n", parm); + return sysfs_emit(page, "%s\n", parm); } static struct kobj_attribute sys_ipl_vm_parm_attr = @@ -353,18 +413,18 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, - ipl_block.ccw.devno); + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); case IPL_TYPE_ECKD: case IPL_TYPE_ECKD_DUMP: - return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid, - ipl_block.eckd.devno); + return sysfs_emit(page, "0.%x.%04x\n", ipl_block.eckd.ssid, + ipl_block.eckd.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); + return sysfs_emit(page, "0.0.%04x\n", ipl_block.fcp.devno); case IPL_TYPE_NVME: case IPL_TYPE_NVME_DUMP: - return sprintf(page, "%08ux\n", ipl_block.nvme.fid); + return sysfs_emit(page, "%08ux\n", ipl_block.nvme.fid); default: return 0; } @@ -373,71 +433,38 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, static struct kobj_attribute sys_ipl_device_attr = __ATTR(device, 0444, sys_ipl_device_show, NULL); -static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) +static ssize_t sys_ipl_parameter_read(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { return memory_read_from_buffer(buf, count, &off, &ipl_block, ipl_block.hdr.len); } -static struct bin_attribute ipl_parameter_attr = - __BIN_ATTR(binary_parameter, 0444, ipl_parameter_read, NULL, +static const struct bin_attribute sys_ipl_parameter_attr = + __BIN_ATTR(binary_parameter, 0444, sys_ipl_parameter_read, NULL, PAGE_SIZE); -static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.fcp.scp_data_len; - void *scp_data = &ipl_block.fcp.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_fcp, ipl_block.fcp, PAGE_SIZE); -static ssize_t ipl_nvme_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.nvme.scp_data_len; - void *scp_data = &ipl_block.nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); 
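
For reference, each DEFINE_IPL_ATTR_SCP_DATA_RO() use above generates the same reader that the deleted per-device functions spelled out by hand. Roughly, DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_fcp, ipl_block.fcp, PAGE_SIZE) expands to the following (token pasting and line breaks simplified):

	static ssize_t sys_ipl_fcp_scp_data_show(struct file *filp, struct kobject *kobj,
						 const struct bin_attribute *attr,
						 char *buf, loff_t off, size_t count)
	{
		size_t size = ipl_block.fcp.scp_data_len;	/* valid SCP data bytes */
		void *scp_data = ipl_block.fcp.scp_data;

		return memory_read_from_buffer(buf, count, &off, scp_data, size);
	}

	static const struct bin_attribute sys_ipl_fcp_scp_data_attr =
		__BIN_ATTR(scp_data, 0444, sys_ipl_fcp_scp_data_show, NULL, PAGE_SIZE);
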
-} - -static ssize_t ipl_eckd_scp_data_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, char *buf, - loff_t off, size_t count) -{ - unsigned int size = ipl_block.eckd.scp_data_len; - void *scp_data = &ipl_block.eckd.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static struct bin_attribute ipl_scp_data_attr = - __BIN_ATTR(scp_data, 0444, ipl_scp_data_read, NULL, PAGE_SIZE); - -static struct bin_attribute ipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, 0444, ipl_nvme_scp_data_read, NULL, PAGE_SIZE); - -static struct bin_attribute ipl_eckd_scp_data_attr = - __BIN_ATTR(scp_data, 0444, ipl_eckd_scp_data_read, NULL, PAGE_SIZE); - -static struct bin_attribute *ipl_fcp_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_scp_data_attr, +static const struct bin_attribute *const ipl_fcp_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_fcp_scp_data_attr, NULL, }; -static struct bin_attribute *ipl_nvme_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_nvme_scp_data_attr, +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_nvme, ipl_block.nvme, PAGE_SIZE); + +static const struct bin_attribute *const ipl_nvme_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_nvme_scp_data_attr, NULL, }; -static struct bin_attribute *ipl_eckd_bin_attrs[] = { - &ipl_parameter_attr, - &ipl_eckd_scp_data_attr, +DEFINE_IPL_ATTR_SCP_DATA_RO(ipl_eckd, ipl_block.eckd, PAGE_SIZE); + +static const struct bin_attribute *const ipl_eckd_bin_attrs[] = { + &sys_ipl_parameter_attr, + &sys_ipl_eckd_scp_data_attr, NULL, }; @@ -476,12 +503,12 @@ static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \ if (!ipb->br_chr.cyl && \ !ipb->br_chr.head && \ !ipb->br_chr.record) \ - return sprintf(buf, "auto\n"); \ + return sysfs_emit(buf, "auto\n"); \ \ - return sprintf(buf, "0x%x,0x%x,0x%x\n", \ - ipb->br_chr.cyl, \ - ipb->br_chr.head, \ - ipb->br_chr.record); \ + return sysfs_emit(buf, "0x%x,0x%x,0x%x\n", \ + ipb->br_chr.cyl, \ + ipb->br_chr.head, \ + ipb->br_chr.record); \ } #define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \ @@ -546,11 +573,11 @@ static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, char loadparm[LOADPARM_LEN + 1] = {}; if (!sclp_ipl_info.is_valid) - return sprintf(page, "#unknown#\n"); + return sysfs_emit(page, "#unknown#\n"); memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); strim(loadparm); - return sprintf(page, "%s\n", loadparm); + return sysfs_emit(page, "%s\n", loadparm); } static struct kobj_attribute sys_ipl_ccw_loadparm_attr = @@ -566,9 +593,9 @@ static struct attribute *ipl_fcp_attrs[] = { NULL, }; -static struct attribute_group ipl_fcp_attr_group = { +static const struct attribute_group ipl_fcp_attr_group = { .attrs = ipl_fcp_attrs, - .bin_attrs = ipl_fcp_bin_attrs, + .bin_attrs_new = ipl_fcp_bin_attrs, }; static struct attribute *ipl_nvme_attrs[] = { @@ -580,9 +607,9 @@ static struct attribute *ipl_nvme_attrs[] = { NULL, }; -static struct attribute_group ipl_nvme_attr_group = { +static const struct attribute_group ipl_nvme_attr_group = { .attrs = ipl_nvme_attrs, - .bin_attrs = ipl_nvme_bin_attrs, + .bin_attrs_new = ipl_nvme_bin_attrs, }; static struct attribute *ipl_eckd_attrs[] = { @@ -593,9 +620,9 @@ static struct attribute *ipl_eckd_attrs[] = { NULL, }; -static struct attribute_group ipl_eckd_attr_group = { +static const struct attribute_group ipl_eckd_attr_group = { .attrs = ipl_eckd_attrs, - .bin_attrs = ipl_eckd_bin_attrs, + .bin_attrs_new = ipl_eckd_bin_attrs, }; /* CCW ipl device attributes */ @@ -613,11 +640,11 
@@ static struct attribute *ipl_ccw_attrs_lpar[] = { NULL, }; -static struct attribute_group ipl_ccw_attr_group_vm = { +static const struct attribute_group ipl_ccw_attr_group_vm = { .attrs = ipl_ccw_attrs_vm, }; -static struct attribute_group ipl_ccw_attr_group_lpar = { +static const struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; @@ -628,7 +655,7 @@ static struct attribute *ipl_common_attrs[] = { NULL, }; -static struct attribute_group ipl_common_attr_group = { +static const struct attribute_group ipl_common_attr_group = { .attrs = ipl_common_attrs, }; @@ -704,7 +731,7 @@ static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb, char vmparm[DIAG308_VMPARM_SIZE + 1] = {}; ipl_block_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb); - return sprintf(page, "%s\n", vmparm); + return sysfs_emit(page, "%s\n", vmparm); } static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, @@ -776,46 +803,12 @@ static struct kobj_attribute sys_reipl_ccw_vmparm_attr = /* FCP reipl device attributes */ -static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_fcp->fcp.scp_data_len; - void *scp_data = reipl_block_fcp->fcp.scp_data; +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_fcp, reipl_block_fcp->hdr, + reipl_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - - if (off) - return -EINVAL; - - memcpy(reipl_block_fcp->fcp.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->fcp.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } - - reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_fcp->fcp.scp_data_len = scpdata_len; - - return count; -} -static struct bin_attribute sys_reipl_fcp_scp_data_attr = - __BIN_ATTR(scp_data, 0644, reipl_fcp_scpdata_read, - reipl_fcp_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_fcp_bin_attrs[] = { +static const struct bin_attribute *const reipl_fcp_bin_attrs[] = { &sys_reipl_fcp_scp_data_attr, NULL, }; @@ -846,7 +839,7 @@ static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb, char buf[LOADPARM_LEN + 1]; reipl_get_ascii_loadparm(buf, ipb); - return sprintf(page, "%s\n", buf); + return sysfs_emit(page, "%s\n", buf); } static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, @@ -902,7 +895,7 @@ DEFINE_GENERIC_LOADPARM(eckd); static ssize_t reipl_fcp_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_fcp_clear); + return sysfs_emit(page, "%u\n", reipl_fcp_clear); } static ssize_t reipl_fcp_clear_store(struct kobject *kobj, @@ -924,9 +917,9 @@ static struct attribute *reipl_fcp_attrs[] = { NULL, }; -static struct attribute_group reipl_fcp_attr_group = { +static const struct attribute_group reipl_fcp_attr_group = { .attrs = reipl_fcp_attrs, - .bin_attrs = reipl_fcp_bin_attrs, + .bin_attrs_new = reipl_fcp_bin_attrs, }; static struct kobj_attribute sys_reipl_fcp_clear_attr = @@ -934,46 +927,12 @@ static struct kobj_attribute 
sys_reipl_fcp_clear_attr = /* NVME reipl device attributes */ -static ssize_t reipl_nvme_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_nvme->nvme.scp_data_len; - void *scp_data = reipl_block_nvme->nvme.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_nvme_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - if (off) - return -EINVAL; - - memcpy(reipl_block_nvme->nvme.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_nvme->nvme.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_nvme, reipl_block_nvme->hdr, + reipl_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); - reipl_block_nvme->hdr.len = IPL_BP_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.len = IPL_BP0_FCP_LEN + scpdata_len; - reipl_block_nvme->nvme.scp_data_len = scpdata_len; - - return count; -} - -static struct bin_attribute sys_reipl_nvme_scp_data_attr = - __BIN_ATTR(scp_data, 0644, reipl_nvme_scpdata_read, - reipl_nvme_scpdata_write, DIAG308_SCPDATA_SIZE); - -static struct bin_attribute *reipl_nvme_bin_attrs[] = { +static const struct bin_attribute *const reipl_nvme_bin_attrs[] = { &sys_reipl_nvme_scp_data_attr, NULL, }; @@ -996,15 +955,15 @@ static struct attribute *reipl_nvme_attrs[] = { NULL, }; -static struct attribute_group reipl_nvme_attr_group = { +static const struct attribute_group reipl_nvme_attr_group = { .attrs = reipl_nvme_attrs, - .bin_attrs = reipl_nvme_bin_attrs + .bin_attrs_new = reipl_nvme_bin_attrs }; static ssize_t reipl_nvme_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_nvme_clear); + return sysfs_emit(page, "%u\n", reipl_nvme_clear); } static ssize_t reipl_nvme_clear_store(struct kobject *kobj, @@ -1025,7 +984,7 @@ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); static ssize_t reipl_ccw_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_ccw_clear); + return sysfs_emit(page, "%u\n", reipl_ccw_clear); } static ssize_t reipl_ccw_clear_store(struct kobject *kobj, @@ -1067,46 +1026,12 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { /* ECKD reipl device attributes */ -static ssize_t reipl_eckd_scpdata_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t size = reipl_block_eckd->eckd.scp_data_len; - void *scp_data = reipl_block_eckd->eckd.scp_data; - - return memory_read_from_buffer(buf, count, &off, scp_data, size); -} - -static ssize_t reipl_eckd_scpdata_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - size_t scpdata_len = count; - size_t padding; - - if (off) - return -EINVAL; - - memcpy(reipl_block_eckd->eckd.scp_data, buf, count); - if (scpdata_len % 8) { - padding = 8 - (scpdata_len % 8); - memset(reipl_block_eckd->eckd.scp_data + scpdata_len, - 0, padding); - scpdata_len += padding; - } - - reipl_block_eckd->hdr.len = IPL_BP_ECKD_LEN + scpdata_len; - reipl_block_eckd->eckd.len = IPL_BP0_ECKD_LEN + scpdata_len; - reipl_block_eckd->eckd.scp_data_len = scpdata_len; - - return count; -} - 
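
The shared store path behind DEFINE_IPL_ATTR_SCP_DATA_RW() keeps the semantics of the deleted per-device writers: writes must start at offset 0, and the data is zero-padded to a multiple of 8 bytes before the block lengths are updated. A worked example of the padding arithmetic (hdr_len and blk_len stand in for the two per-device length fields):

	/* userspace writes count = 13 bytes at offset 0 */
	size_t scpdata_len = 13;
	size_t padding = 8 - (scpdata_len % 8);	/* 8 - 5 = 3 */

	memset(scp_data + scpdata_len, 0, padding);
	scpdata_len += padding;			/* now 16, a multiple of 8 */

	/* both length fields account for the padded size */
	hdr_len = IPL_BP_ECKD_LEN + scpdata_len;
	blk_len = IPL_BP0_ECKD_LEN + scpdata_len;
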
-static struct bin_attribute sys_reipl_eckd_scp_data_attr = - __BIN_ATTR(scp_data, 0644, reipl_eckd_scpdata_read, - reipl_eckd_scpdata_write, DIAG308_SCPDATA_SIZE); +DEFINE_IPL_ATTR_SCP_DATA_RW(reipl_eckd, reipl_block_eckd->hdr, + reipl_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); -static struct bin_attribute *reipl_eckd_bin_attrs[] = { +static const struct bin_attribute *const reipl_eckd_bin_attrs[] = { &sys_reipl_eckd_scp_data_attr, NULL, }; @@ -1123,15 +1048,15 @@ static struct attribute *reipl_eckd_attrs[] = { NULL, }; -static struct attribute_group reipl_eckd_attr_group = { +static const struct attribute_group reipl_eckd_attr_group = { .attrs = reipl_eckd_attrs, - .bin_attrs = reipl_eckd_bin_attrs + .bin_attrs_new = reipl_eckd_bin_attrs }; static ssize_t reipl_eckd_clear_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%u\n", reipl_eckd_clear); + return sysfs_emit(page, "%u\n", reipl_eckd_clear); } static ssize_t reipl_eckd_clear_store(struct kobject *kobj, @@ -1161,7 +1086,7 @@ static ssize_t reipl_nss_name_show(struct kobject *kobj, char nss_name[NSS_NAME_SIZE + 1] = {}; reipl_get_ascii_nss_name(nss_name, reipl_block_nss); - return sprintf(page, "%s\n", nss_name); + return sysfs_emit(page, "%s\n", nss_name); } static ssize_t reipl_nss_name_store(struct kobject *kobj, @@ -1209,8 +1134,8 @@ static struct attribute_group reipl_nss_attr_group = { void set_os_info_reipl_block(void) { - os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, - reipl_block_actual->hdr.len); + os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); } /* reipl type */ @@ -1246,7 +1171,7 @@ static int reipl_set_type(enum ipl_type type) static ssize_t reipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", ipl_type_str(reipl_type)); + return sysfs_emit(page, "%s\n", ipl_type_str(reipl_type)); } static ssize_t reipl_type_store(struct kobject *kobj, @@ -1648,6 +1573,11 @@ DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", dump_block_fcp->fcp.devno); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_fcp, dump_block_fcp->hdr, + dump_block_fcp->fcp, + IPL_BP_FCP_LEN, IPL_BP0_FCP_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, &sys_dump_fcp_wwpn_attr.attr, @@ -1657,9 +1587,15 @@ static struct attribute *dump_fcp_attrs[] = { NULL, }; -static struct attribute_group dump_fcp_attr_group = { +static const struct bin_attribute *const dump_fcp_bin_attrs[] = { + &sys_dump_fcp_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_fcp_attr_group = { .name = IPL_FCP_STR, .attrs = dump_fcp_attrs, + .bin_attrs_new = dump_fcp_bin_attrs, }; /* NVME dump device attributes */ @@ -1672,6 +1608,11 @@ DEFINE_IPL_ATTR_RW(dump_nvme, bootprog, "%lld\n", "%llx\n", DEFINE_IPL_ATTR_RW(dump_nvme, br_lba, "%lld\n", "%llx\n", dump_block_nvme->nvme.br_lba); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_nvme, dump_block_nvme->hdr, + dump_block_nvme->nvme, + IPL_BP_NVME_LEN, IPL_BP0_NVME_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_nvme_attrs[] = { &sys_dump_nvme_fid_attr.attr, &sys_dump_nvme_nsid_attr.attr, @@ -1680,9 +1621,15 @@ static struct attribute *dump_nvme_attrs[] = { NULL, }; -static struct attribute_group dump_nvme_attr_group = { +static const struct bin_attribute *const dump_nvme_bin_attrs[] = { + 
&sys_dump_nvme_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_nvme_attr_group = { .name = IPL_NVME_STR, .attrs = dump_nvme_attrs, + .bin_attrs_new = dump_nvme_bin_attrs, }; /* ECKD dump device attributes */ @@ -1696,6 +1643,11 @@ IPL_ATTR_BR_CHR_STORE_FN(dump, dump_block_eckd->eckd); static struct kobj_attribute sys_dump_eckd_br_chr_attr = __ATTR(br_chr, 0644, eckd_dump_br_chr_show, eckd_dump_br_chr_store); +DEFINE_IPL_ATTR_SCP_DATA_RW(dump_eckd, dump_block_eckd->hdr, + dump_block_eckd->eckd, + IPL_BP_ECKD_LEN, IPL_BP0_ECKD_LEN, + DIAG308_SCPDATA_SIZE); + static struct attribute *dump_eckd_attrs[] = { &sys_dump_eckd_device_attr.attr, &sys_dump_eckd_bootprog_attr.attr, @@ -1703,9 +1655,15 @@ static struct attribute *dump_eckd_attrs[] = { NULL, }; -static struct attribute_group dump_eckd_attr_group = { +static const struct bin_attribute *const dump_eckd_bin_attrs[] = { + &sys_dump_eckd_scp_data_attr, + NULL, +}; + +static const struct attribute_group dump_eckd_attr_group = { .name = IPL_ECKD_STR, .attrs = dump_eckd_attrs, + .bin_attrs_new = dump_eckd_bin_attrs, }; /* CCW dump device attributes */ @@ -1734,7 +1692,7 @@ static int dump_set_type(enum dump_type type) static ssize_t dump_type_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", dump_type_str(dump_type)); + return sysfs_emit(page, "%s\n", dump_type_str(dump_type)); } static ssize_t dump_type_store(struct kobject *kobj, @@ -1759,6 +1717,24 @@ static ssize_t dump_type_store(struct kobject *kobj, static struct kobj_attribute dump_type_attr = __ATTR(dump_type, 0644, dump_type_show, dump_type_store); +static ssize_t dump_area_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sysfs_emit(page, "%lu\n", sclp.hsa_size); +} + +static struct kobj_attribute dump_area_size_attr = __ATTR_RO(dump_area_size); + +static struct attribute *dump_attrs[] = { + &dump_type_attr.attr, + &dump_area_size_attr.attr, + NULL, +}; + +static struct attribute_group dump_attr_group = { + .attrs = dump_attrs, +}; + static struct kset *dump_kset; static void diag308_dump(void *dump_block) @@ -1858,9 +1834,9 @@ static int __init dump_nvme_init(void) } dump_block_nvme->hdr.len = IPL_BP_NVME_LEN; dump_block_nvme->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_nvme->fcp.len = IPL_BP0_NVME_LEN; - dump_block_nvme->fcp.pbt = IPL_PBT_NVME; - dump_block_nvme->fcp.opt = IPL_PB0_NVME_OPT_DUMP; + dump_block_nvme->nvme.len = IPL_BP0_NVME_LEN; + dump_block_nvme->nvme.pbt = IPL_PBT_NVME; + dump_block_nvme->nvme.opt = IPL_PB0_NVME_OPT_DUMP; dump_capabilities |= DUMP_TYPE_NVME; return 0; } @@ -1895,7 +1871,7 @@ static int __init dump_init(void) dump_kset = kset_create_and_add("dump", NULL, firmware_kobj); if (!dump_kset) return -ENOMEM; - rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr); + rc = sysfs_create_group(&dump_kset->kobj, &dump_attr_group); if (rc) { kset_unregister(dump_kset); return rc; @@ -1940,7 +1916,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger) reipl_type == IPL_TYPE_NSS || reipl_type == IPL_TYPE_UNKNOWN) os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; - os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); + os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0); abs_lc = get_abs_lowcore(); abs_lc->ipib = __pa(reipl_block_actual); @@ -1958,11 +1934,13 @@ static struct shutdown_action 
__refdata dump_reipl_action = { * vmcmd shutdown action: Trigger vm command on shutdown. */ -static char vmcmd_on_reboot[128]; -static char vmcmd_on_panic[128]; -static char vmcmd_on_halt[128]; -static char vmcmd_on_poff[128]; -static char vmcmd_on_restart[128]; +#define VMCMD_MAX_SIZE 240 + +static char vmcmd_on_reboot[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_panic[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_halt[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_poff[VMCMD_MAX_SIZE + 1]; +static char vmcmd_on_restart[VMCMD_MAX_SIZE + 1]; DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); @@ -2074,7 +2052,7 @@ static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR, static ssize_t on_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_reboot_trigger.action->name); + return sysfs_emit(page, "%s\n", on_reboot_trigger.action->name); } static ssize_t on_reboot_store(struct kobject *kobj, @@ -2100,7 +2078,7 @@ static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action}; static ssize_t on_panic_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_panic_trigger.action->name); + return sysfs_emit(page, "%s\n", on_panic_trigger.action->name); } static ssize_t on_panic_store(struct kobject *kobj, @@ -2126,7 +2104,7 @@ static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, static ssize_t on_restart_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_restart_trigger.action->name); + return sysfs_emit(page, "%s\n", on_restart_trigger.action->name); } static ssize_t on_restart_store(struct kobject *kobj, @@ -2152,7 +2130,7 @@ void do_restart(void *arg) tracing_off(); debug_locks_off(); lgr_info_log(); - smp_call_online_cpu(__do_restart, arg); + smp_call_ipl_cpu(__do_restart, arg); } /* on halt */ @@ -2162,7 +2140,7 @@ static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; static ssize_t on_halt_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_halt_trigger.action->name); + return sysfs_emit(page, "%s\n", on_halt_trigger.action->name); } static ssize_t on_halt_store(struct kobject *kobj, @@ -2188,7 +2166,7 @@ static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action}; static ssize_t on_poff_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return sprintf(page, "%s\n", on_poff_trigger.action->name); + return sysfs_emit(page, "%s\n", on_poff_trigger.action->name); } static ssize_t on_poff_store(struct kobject *kobj, @@ -2288,8 +2266,8 @@ static int __init vmcmd_on_reboot_setup(char *str) { if (!MACHINE_IS_VM) return 1; - strncpy_skip_quote(vmcmd_on_reboot, str, 127); - vmcmd_on_reboot[127] = 0; + strncpy_skip_quote(vmcmd_on_reboot, str, VMCMD_MAX_SIZE); + vmcmd_on_reboot[VMCMD_MAX_SIZE] = 0; on_reboot_trigger.action = &vmcmd_action; return 1; } @@ -2299,8 +2277,8 @@ static int __init vmcmd_on_panic_setup(char *str) { if (!MACHINE_IS_VM) return 1; - strncpy_skip_quote(vmcmd_on_panic, str, 127); - vmcmd_on_panic[127] = 0; + strncpy_skip_quote(vmcmd_on_panic, str, VMCMD_MAX_SIZE); + vmcmd_on_panic[VMCMD_MAX_SIZE] = 0; on_panic_trigger.action = &vmcmd_action; return 1; } @@ -2310,8 +2288,8 @@ static int __init vmcmd_on_halt_setup(char *str) { if (!MACHINE_IS_VM) return 1; - 
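
The vmcmd buffers above grow from a hard-coded 128 bytes to VMCMD_MAX_SIZE + 1, where 240 appears to correspond to the longest command line z/VM CP accepts; the extra byte leaves room for the terminating NUL that each setup handler stores explicitly:

	static char vmcmd_on_reboot[VMCMD_MAX_SIZE + 1];

	/* copy at most VMCMD_MAX_SIZE characters, then force termination */
	strncpy_skip_quote(vmcmd_on_reboot, str, VMCMD_MAX_SIZE);
	vmcmd_on_reboot[VMCMD_MAX_SIZE] = 0;
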
strncpy_skip_quote(vmcmd_on_halt, str, 127); - vmcmd_on_halt[127] = 0; + strncpy_skip_quote(vmcmd_on_halt, str, VMCMD_MAX_SIZE); + vmcmd_on_halt[VMCMD_MAX_SIZE] = 0; on_halt_trigger.action = &vmcmd_action; return 1; } @@ -2321,8 +2299,8 @@ static int __init vmcmd_on_poff_setup(char *str) { if (!MACHINE_IS_VM) return 1; - strncpy_skip_quote(vmcmd_on_poff, str, 127); - vmcmd_on_poff[127] = 0; + strncpy_skip_quote(vmcmd_on_poff, str, VMCMD_MAX_SIZE); + vmcmd_on_poff[VMCMD_MAX_SIZE] = 0; on_poff_trigger.action = &vmcmd_action; return 1; } diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 6f71b0ce1068..ef7be599e1f7 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -29,6 +29,8 @@ #include <asm/hw_irq.h> #include <asm/stacktrace.h> #include <asm/softirq_stack.h> +#include <asm/vtime.h> +#include <asm/asm.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -75,6 +77,7 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQEXT_WTI, .name = "WTI", .desc = "[EXT] Warning Track"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, @@ -99,8 +102,8 @@ static const struct irq_class irqclass_sub_desc[] = { static void do_IRQ(struct pt_regs *regs, int irq) { - if (tod_after_eq(S390_lowcore.int_clock, - S390_lowcore.clock_comparator)) + if (tod_after_eq(get_lowcore()->int_clock, + get_lowcore()->clock_comparator)) /* Serve timer interrupts first. 
*/ clock_comparator_work(); generic_handle_irq(irq); @@ -110,7 +113,7 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; + return ((get_lowcore()->async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) @@ -118,7 +121,7 @@ static void do_irq_async(struct pt_regs *regs, int irq) if (on_async_stack()) { do_IRQ(regs, irq); } else { - call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + call_on_stack(2, get_lowcore()->async_stack, void, do_IRQ, struct pt_regs *, regs, int, irq); } } @@ -127,9 +130,13 @@ static int irq_pending(struct pt_regs *regs) { int cc; - asm volatile("tpi 0\n" - "ipm %0" : "=d" (cc) : : "cc"); - return cc >> 28; + asm volatile( + " tpi 0\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : + : CC_CLOBBER); + return CC_TRANSFORM(cc); } void noinstr do_io_irq(struct pt_regs *regs) @@ -150,9 +157,10 @@ void noinstr do_io_irq(struct pt_regs *regs) if (from_idle) account_idle_time_irq(); + set_cpu_flag(CIF_NOHZ_DELAY); do { - regs->tpi_info = S390_lowcore.tpi_info; - if (S390_lowcore.tpi_info.adapter_IO) + regs->tpi_info = get_lowcore()->tpi_info; + if (get_lowcore()->tpi_info.adapter_IO) do_irq_async(regs, THIN_INTERRUPT); else do_irq_async(regs, IO_INTERRUPT); @@ -181,9 +189,9 @@ void noinstr do_ext_irq(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - regs->int_code = S390_lowcore.ext_int_code_addr; - regs->int_parm = S390_lowcore.ext_params; - regs->int_parm_long = S390_lowcore.ext_params2; + regs->int_code = get_lowcore()->ext_int_code_addr; + regs->int_parm = get_lowcore()->ext_params; + regs->int_parm_long = get_lowcore()->ext_params2; from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) @@ -250,7 +258,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index < nr_irqs) { + if (index < irq_get_nr_irqs()) { show_msi_interrupt(p, index); goto out; } diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index f0cf20d4b3c5..8b80ea57125f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -9,7 +9,6 @@ #define pr_fmt(fmt) "kprobes: " fmt -#include <linux/moduleloader.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> @@ -21,10 +20,11 @@ #include <linux/slab.h> #include <linux/hardirq.h> #include <linux/ftrace.h> +#include <linux/execmem.h> +#include <asm/text-patching.h> #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> -#include "kprobes.h" #include "entry.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe); @@ -32,39 +32,17 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -static int insn_page_in_use; - void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; set_memory_rox((unsigned long)page, 1); return page; } -static void *alloc_s390_insn_page(void) -{ - if (xchg(&insn_page_in_use, 1) == 1) - return NULL; - return &kprobes_insn_page; -} - -static void free_s390_insn_page(void *page) -{ - xchg(&insn_page_in_use, 0); -} - -struct kprobe_insn_cache kprobe_s390_insn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), - .alloc = alloc_s390_insn_page, - .free = free_s390_insn_page, - .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), - .insn_size = MAX_INSN_SIZE, -}; - static 
void copy_instruction(struct kprobe *p) { kprobe_opcode_t insn[MAX_INSN_SIZE]; @@ -78,10 +56,10 @@ static void copy_instruction(struct kprobe *p) if (probe_is_insn_relative_long(&insn[0])) { /* * For pc-relative instructions in RIL-b or RIL-c format patch - * the RI2 displacement field. We have already made sure that - * the insn slot for the patched instruction is within the same - * 2GB area as the original instruction (either kernel image or - * module area). Therefore the new displacement will always fit. + * the RI2 displacement field. The insn slot for the to be + * patched instruction is within the same 4GB area like the + * original instruction. Therefore the new displacement will + * always fit. */ disp = *(s32 *)&insn[1]; addr = (u64)(unsigned long)p->addr; @@ -93,34 +71,6 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static int s390_get_insn_slot(struct kprobe *p) -{ - /* - * Get an insn slot that is within the same 2GB area like the original - * instruction. That way instructions with a 32bit signed displacement - * field can be patched and executed within the insn slot. - */ - p->ainsn.insn = NULL; - if (is_kernel((unsigned long)p->addr)) - p->ainsn.insn = get_s390_insn_slot(); - else if (is_module_addr(p->addr)) - p->ainsn.insn = get_insn_slot(); - return p->ainsn.insn ? 0 : -ENOMEM; -} -NOKPROBE_SYMBOL(s390_get_insn_slot); - -static void s390_free_insn_slot(struct kprobe *p) -{ - if (!p->ainsn.insn) - return; - if (is_kernel((unsigned long)p->addr)) - free_s390_insn_slot(p->ainsn.insn, 0); - else - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; -} -NOKPROBE_SYMBOL(s390_free_insn_slot); - /* Check if paddr is at an instruction boundary */ static bool can_probe(unsigned long paddr) { @@ -174,7 +124,8 @@ int arch_prepare_kprobe(struct kprobe *p) /* Make sure the probe isn't going on a difficult instruction */ if (probe_is_prohibited_opcode(p->addr)) return -EINVAL; - if (s390_get_insn_slot(p)) + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) return -ENOMEM; copy_instruction(p); return 0; @@ -202,7 +153,12 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (MACHINE_HAS_SEQ_INSN) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -210,13 +166,21 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (MACHINE_HAS_SEQ_INSN) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_disarm_kprobe); void arch_remove_kprobe(struct kprobe *p) { - s390_free_insn_slot(p); + if (!p->ainsn.insn) + return; + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; } NOKPROBE_SYMBOL(arch_remove_kprobe); @@ -525,6 +489,12 @@ int __init arch_init_kprobes(void) return 0; } +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); +} + int arch_trampoline_kprobe(struct kprobe *p) { return 0; diff --git a/arch/s390/kernel/kprobes.h b/arch/s390/kernel/kprobes.h deleted file mode 100644 index dc3ed5098ee7..000000000000 --- a/arch/s390/kernel/kprobes.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ 
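
Context for the kprobes deletions here: once insn slots come from execmem_alloc(EXECMEM_KPROBES), the dedicated in-image insn page and the kernel/module slot split are no longer needed; per the updated comment in copy_instruction(), a slot lies within the same 4GB area as the original instruction, so the relocated displacement always fits. A sketch of the RIL-b/RIL-c fix-up that copy_instruction() still performs (displacements count halfwords, hence the * 2 and / 2):

	s32 disp = *(s32 *)&insn[1];				/* original RI2 field */
	u64 addr = (u64)(unsigned long)p->addr;			/* probed location */
	u64 new_addr = (u64)(unsigned long)p->ainsn.insn;	/* insn slot */

	/* same absolute target, re-expressed relative to the slot */
	s32 new_disp = ((addr + (disp * 2)) - new_addr) / 2;
	*(s32 *)&insn[1] = new_disp;
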
-#ifndef _ARCH_S390_KPROBES_H -#define _ARCH_S390_KPROBES_H - -#include <linux/kprobes.h> - -DEFINE_INSN_CACHE_OPS(s390_insn); - -#endif diff --git a/arch/s390/kernel/kprobes_insn_page.S b/arch/s390/kernel/kprobes_insn_page.S deleted file mode 100644 index 0fe4d725e98b..000000000000 --- a/arch/s390/kernel/kprobes_insn_page.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#include <linux/linkage.h> - -/* - * insn_page is a special 4k aligned dummy function for kprobes. - * It will contain all kprobed instructions that are out-of-line executed. - * The page must be within the kernel image to guarantee that the - * out-of-line instructions are within 2GB distance of their original - * location. Using a dummy function ensures that the insn_page is within - * the text section of the kernel and mapped read-only/executable from - * the beginning on, thus avoiding to split large mappings if the page - * would be in the data section instead. - */ - .section .kprobes.text, "ax" - .balign 4096 -SYM_CODE_START(kprobes_insn_page) - .rept 2048 - .word 0x07fe - .endr -SYM_CODE_END(kprobes_insn_page) - .previous diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 6652e54cf3db..6d1ffca5f798 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -166,7 +166,7 @@ static struct timer_list lgr_timer; */ static void lgr_timer_set(void) { - mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC)); + mod_timer(&lgr_timer, jiffies + secs_to_jiffies(LGR_TIMER_INTERVAL_SECS)); } /* diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 3aee98efc374..8f681ccfb83a 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -52,7 +52,7 @@ static void __do_machine_kdump(void *data) purgatory = (purgatory_t)image->start; /* store_status() saved the prefix register to lowcore */ - prefix = (unsigned long) S390_lowcore.prefixreg_save_area; + prefix = (unsigned long)get_lowcore()->prefixreg_save_area; /* Now do the reset */ s390_reset_system(); @@ -62,7 +62,7 @@ static void __do_machine_kdump(void *data) * This need to be done *after* s390_reset_system set the * prefix register of this CPU to zero */ - memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA), + memcpy(absolute_pointer(get_lowcore()->floating_pt_save_area), phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512); call_nodat(1, int, purgatory, int, 1); @@ -91,7 +91,7 @@ static noinline void __machine_kdump(void *image) continue; } /* Store status of the boot CPU */ - mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + mcesa = __va(get_lowcore()->mcesad & MCESA_ORIGIN_MASK); if (cpu_has_vx()) save_vx_regs((__vector128 *) mcesa->vector_save_area); if (MACHINE_HAS_GS) { diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index ae4d4fd9afcd..1fec370fecf4 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -9,6 +9,7 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> +#include <asm/march.h> #define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -88,7 +89,7 @@ SYM_CODE_START(ftrace_caller) SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_common) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op lgrl %r1,ftrace_func @@ -103,19 +104,8 @@ SYM_CODE_START(ftrace_common) lgr %r3,%r14 la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 -#ifdef 
CONFIG_FUNCTION_GRAPH_TRACER -# The j instruction gets runtime patched to a nop instruction. -# See ftrace_enable_ftrace_graph_caller. -SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) - j .Lftrace_graph_caller_end - lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15) - brasl %r14,prepare_ftrace_return - stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) -.Lftrace_graph_caller_end: -#endif lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) locgrz %r1,%r0 #else @@ -133,14 +123,14 @@ SYM_CODE_END(ftrace_common) SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) + # allocate ftrace_regs and stack frame for ftrace_return_to_handler + aghi %r15,-STACK_FRAME_SIZE_FREGS stg %r1,__SF_BACKCHAIN(%r15) - la %r3,STACK_FRAME_OVERHEAD(%r15) - stg %r1,__FGRAPH_RET_FP(%r3) - stg %r2,__FGRAPH_RET_GPR2(%r3) - lgr %r2,%r3 + stg %r2,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + la %r2,STACK_FRAME_OVERHEAD(%r15) brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE + aghi %r15,STACK_FRAME_SIZE_FREGS lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 42215f9404af..91e207b50394 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -21,6 +21,7 @@ #include <linux/moduleloader.h> #include <linux/bug.h> #include <linux/memory.h> +#include <linux/execmem.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/facility.h> @@ -36,47 +37,10 @@ #define PLT_ENTRY_SIZE 22 -static unsigned long get_module_load_offset(void) -{ - static DEFINE_MUTEX(module_kaslr_mutex); - static unsigned long module_load_offset; - - if (!kaslr_enabled()) - return 0; - /* - * Calculate the module_load_offset the first time this code - * is called. Once calculated it stays the same until reboot. 
- */ - mutex_lock(&module_kaslr_mutex); - if (!module_load_offset) - module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - return module_load_offset; -} - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - return p; -} - #ifdef CONFIG_FUNCTION_TRACER void module_arch_cleanup(struct module *mod) { - module_memfree(mod->arch.trampolines_start); + execmem_free(mod->arch.trampolines_start); } #endif @@ -510,7 +474,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me, size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size); numpages = DIV_ROUND_UP(size, PAGE_SIZE); - start = module_alloc(numpages * PAGE_SIZE); + start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE); if (!start) return -ENOMEM; set_memory_rox((unsigned long)start, numpages); diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index c77382a67325..fbd218b6fc8e 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -31,6 +31,7 @@ #include <asm/crw.h> #include <asm/asm-offsets.h> #include <asm/pai.h> +#include <asm/vtime.h> struct mcck_struct { unsigned int kill_task : 1; @@ -116,6 +117,7 @@ static __always_inline char *u64_to_hex(char *dest, u64 val) static notrace void s390_handle_damage(void) { + struct lowcore *lc = get_lowcore(); union ctlreg0 cr0, cr0_new; char message[100]; psw_t psw_save; @@ -124,7 +126,7 @@ static notrace void s390_handle_damage(void) smp_emergency_stop(); diag_amode31_ops.diag308_reset(); ptr = nmi_puts(message, "System stopped due to unrecoverable machine check, code: 0x"); - u64_to_hex(ptr, S390_lowcore.mcck_interruption_code); + u64_to_hex(ptr, lc->mcck_interruption_code); /* * Disable low address protection and make machine check new PSW a @@ -134,17 +136,17 @@ static notrace void s390_handle_damage(void) cr0_new = cr0; cr0_new.lap = 0; local_ctl_load(0, &cr0_new.reg); - psw_save = S390_lowcore.mcck_new_psw; - psw_bits(S390_lowcore.mcck_new_psw).io = 0; - psw_bits(S390_lowcore.mcck_new_psw).ext = 0; - psw_bits(S390_lowcore.mcck_new_psw).wait = 1; + psw_save = lc->mcck_new_psw; + psw_bits(lc->mcck_new_psw).io = 0; + psw_bits(lc->mcck_new_psw).ext = 0; + psw_bits(lc->mcck_new_psw).wait = 1; sclp_emergency_printk(message); /* * Restore machine check new PSW and control register 0 to original * values. This makes possible system dump analysis easier. */ - S390_lowcore.mcck_new_psw = psw_save; + lc->mcck_new_psw = psw_save; local_ctl_load(0, &cr0.reg); disabled_wait(); while (1); @@ -225,7 +227,7 @@ static bool notrace nmi_registers_valid(union mci mci) /* * Set the clock comparator register to the next expected value. */ - set_clock_comparator(S390_lowcore.clock_comparator); + set_clock_comparator(get_lowcore()->clock_comparator); if (!mci.gr || !mci.fp || !mci.fc) return false; /* @@ -251,7 +253,7 @@ static bool notrace nmi_registers_valid(union mci mci) * check handling must take care of this. The host values are saved by * KVM and are not affected. 
*/ - cr2.reg = S390_lowcore.cregs_save_area[2]; + cr2.reg = get_lowcore()->cregs_save_area[2]; if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST)) return false; if (!mci.ms || !mci.pm || !mci.ia) @@ -277,11 +279,10 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs) sie_page = container_of(sie_block, struct sie_page, sie_block); mcck_backup = &sie_page->mcck_info; - mcck_backup->mcic = S390_lowcore.mcck_interruption_code & + mcck_backup->mcic = get_lowcore()->mcck_interruption_code & ~(MCCK_CODE_CP | MCCK_CODE_EXT_DAMAGE); - mcck_backup->ext_damage_code = S390_lowcore.external_damage_code; - mcck_backup->failing_storage_address - = S390_lowcore.failing_storage_address; + mcck_backup->ext_damage_code = get_lowcore()->external_damage_code; + mcck_backup->failing_storage_address = get_lowcore()->failing_storage_address; } NOKPROBE_SYMBOL(s390_backup_mcck_info); @@ -301,6 +302,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) static int ipd_count; static DEFINE_SPINLOCK(ipd_lock); static unsigned long long last_ipd; + struct lowcore *lc = get_lowcore(); struct mcck_struct *mcck; unsigned long long tmp; irqentry_state_t irq_state; @@ -313,7 +315,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs) if (user_mode(regs)) update_timer_mcck(); inc_irq_stat(NMI_NMI); - mci.val = S390_lowcore.mcck_interruption_code; + mci.val = lc->mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); /* @@ -381,9 +383,9 @@ void notrace s390_do_machine_check(struct pt_regs *regs) } if (mci.ed && mci.ec) { /* External damage */ - if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC)) + if (lc->external_damage_code & (1U << ED_STP_SYNC)) mcck->stp_queue |= stp_sync_check(); - if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND)) + if (lc->external_damage_code & (1U << ED_STP_ISLAND)) mcck->stp_queue |= stp_island_check(); mcck_pending = 1; } diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index d1b16d83e49a..e11ec15960a1 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -4,6 +4,8 @@ #include <linux/cpu.h> #include <asm/nospec-branch.h> +int nobp = IS_ENABLED(CONFIG_KERNEL_NOBP); + static int __init nobp_setup_early(char *str) { bool enabled; @@ -17,11 +19,11 @@ static int __init nobp_setup_early(char *str) * The user explicitly requested nobp=1, enable it and * disable the expoline support. */ - __set_facility(82, alt_stfle_fac_list); + nobp = 1; if (IS_ENABLED(CONFIG_EXPOLINE)) nospec_disable = 1; } else { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } return 0; } @@ -29,7 +31,7 @@ early_param("nobp", nobp_setup_early); static int __init nospec_setup_early(char *str) { - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; return 0; } early_param("nospec", nospec_setup_early); @@ -40,7 +42,7 @@ static int __init nospec_report(void) pr_info("Spectre V2 mitigation: etokens\n"); if (nospec_uses_trampoline()) pr_info("Spectre V2 mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) + if (nobp_enabled()) pr_info("Spectre V2 mitigation: limited branch prediction\n"); return 0; } @@ -66,14 +68,14 @@ void __init nospec_auto_detect(void) */ if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. 
*/ nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } /* * If the kernel has not been compiled with expolines the @@ -86,7 +88,7 @@ static int __init spectre_v2_setup_early(char *str) { if (str && !strncmp(str, "on", 2)) { nospec_disable = 0; - __clear_facility(82, alt_stfle_fac_list); + nobp = 0; } if (str && !strncmp(str, "off", 3)) nospec_disable = 1; @@ -114,10 +116,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) type = BRASL_EXPOLINE; /* brasl instruction */ else continue; - thunk = instr + (*(int *)(instr + 2)) * 2; + thunk = instr + (long)(*(int *)(instr + 2)) * 2; if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0,<target-br> */ - br = thunk + (*(int *)(thunk + 2)) * 2; + br = thunk + (long)(*(int *)(thunk + 2)) * 2; else continue; if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index 52d4353188ad..5970dd3ee7c5 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -7,17 +7,17 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return sysfs_emit(buf, "Mitigation: __user pointer sanitization\n"); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { if (test_facility(156)) - return sprintf(buf, "Mitigation: etokens\n"); + return sysfs_emit(buf, "Mitigation: etokens\n"); if (nospec_uses_trampoline()) - return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, alt_stfle_fac_list)) - return sprintf(buf, "Mitigation: limited branch prediction\n"); - return sprintf(buf, "Vulnerable\n"); + return sysfs_emit(buf, "Mitigation: execute trampolines\n"); + if (nobp_enabled()) + return sysfs_emit(buf, "Mitigation: limited branch prediction\n"); + return sysfs_emit(buf, "Vulnerable\n"); } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 23ab9f02f278..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include <linux/node.h> #include <asm/numa.h> -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; @@ -24,12 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index a801e6bd5341..c2a468986212 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -15,8 +15,11 @@ #include <asm/checksum.h> #include <asm/abs_lowcore.h> #include <asm/os_info.h> +#include <asm/physmem_info.h> #include <asm/maccess.h> #include <asm/asm-offsets.h> +#include <asm/sections.h> +#include <asm/ipl.h> /* * OS info structure has to be page aligned @@ -43,9 +46,9 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) } /* - * Add OS info entry and update checksum + * Add OS info data entry and update checksum */ -void os_info_entry_add(int nr, void *ptr, u64 size) +void 
os_info_entry_add_data(int nr, void *ptr, u64 size) { os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; @@ -54,15 +57,36 @@ void os_info_entry_add(int nr, void *ptr, u64 size) } /* + * Add OS info value entry and update checksum + */ +void os_info_entry_add_val(int nr, u64 value) +{ + os_info.entry[nr].val = value; + os_info.entry[nr].size = 0; + os_info.entry[nr].csum = 0; + os_info.csum = os_info_csum(&os_info); +} + +/* * Initialize OS info structure and set lowcore pointer */ void __init os_info_init(void) { struct lowcore *abs_lc; + BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE); os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; + os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base); + os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset()); + os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys); + os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap); + os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START); + os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END); + os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext); + os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end); + os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext)); os_info.csum = os_info_csum(&os_info); abs_lc = get_abs_lowcore(); abs_lc->os_info = __pa(&os_info); @@ -125,7 +149,7 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!oldmem_data.start) + if (!oldmem_data.start && !is_ipl_type_dump()) goto fail; if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; @@ -157,7 +181,7 @@ fail: } /* - * Return pointer to os infor entry and its size + * Return pointer to os info entry and its size */ void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 41ed6e0f0a2a..33205dd410e4 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -22,6 +22,10 @@ #include <asm/hwctrset.h> #include <asm/debug.h> +/* Perf PMU definitions for the counter facility */ +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ +#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ + enum cpumf_ctr_set { CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ @@ -428,7 +432,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) case CPUMF_CTR_SET_CRYPTO: if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) ctrset_size = 16; - else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + else if (cpumf_ctr_info.csvn >= 6) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: @@ -556,25 +560,31 @@ static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) struct cf_trailer_entry *trailer_start, *trailer_stop; struct cf_ctrset_entry *ctrstart, *ctrstop; size_t offset = 0; + int i; - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + /* Counter set not authorized */ + if (!(auth & cpumf_ctr_ctl[i])) + continue; + /* Counter set size zero was not saved */ + if (!cpum_cf_read_setsize(i)) + continue; + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { pr_err_once("cpum_cf_diag counter set compare error " "in set %i\n", ctrstart->set); return 0; } - auth &= 
~cpumf_ctr_ctl[ctrstart->set]; if (ctrstart->def == CF_DIAG_CTRSET_DEF) { cfdiag_diffctrset((u64 *)(ctrstart + 1), (u64 *)(ctrstop + 1), ctrstart->ctr); offset += ctrstart->ctr * sizeof(u64) + sizeof(*ctrstart); } - } while (ctrstart->def && auth); + } /* Save time_stamp from start of event in stop's trailer */ trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); @@ -825,7 +835,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) return validate_ctr_version(hwc->config, set); } -/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different +/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different * attribute::type values: * - PERF_TYPE_HARDWARE: * - pmu->type: @@ -869,8 +879,8 @@ static int hw_perf_event_reset(struct perf_event *event) u64 prev, new; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) { if (err != 3) @@ -882,7 +892,7 @@ static int hw_perf_event_reset(struct perf_event *event) */ new = 0; } - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); return err; } @@ -892,12 +902,12 @@ static void hw_perf_event_update(struct perf_event *event) u64 prev, new, delta; int err; + prev = local64_read(&event->hw.prev_count); do { - prev = local64_read(&event->hw.prev_count); err = ecctr(event->hw.config, &new); if (err) return; - } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new)); delta = (prev <= new) ? new - prev : (-1ULL - prev) + new + 1; /* overflow */ @@ -971,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); @@ -1044,7 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * * When a new perf event has been added but not yet started, this can * clear enable control and resets all counters in a set. Therefore, - * cpumf_pmu_start() always has to reenable a counter set. + * cpumf_pmu_start() always has to re-enable a counter set. */ for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) if (!atomic_read(&cpuhw->ctr_set[i])) @@ -1688,7 +1698,6 @@ static const struct file_operations cfset_fops = { .release = cfset_release, .unlocked_ioctl = cfset_ioctl, .compat_ioctl = cfset_ioctl, - .llseek = no_llseek }; static struct miscdevice cfset_dev = { @@ -1854,7 +1863,7 @@ static const struct attribute_group *cfdiag_attr_groups[] = { /* Performance monitoring unit for event CF_DIAG. Since this event * is also started and stopped via the perf_event_open() system call, use * the same event enable/disable call back functions. They do not - * have a pointer to the perf_event strcture as first parameter. + * have a pointer to the perf_event structure as first parameter. * * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. 
* Reuse them and distinguish the event (always first parameter) via diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 0d64aafd158f..e4a6bfc91080 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -855,16 +855,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) } /* Determine version specific crypto set */ - switch (ci.csvn) { - case 1 ... 5: + csvn = none; + if (ci.csvn >= 1 && ci.csvn <= 5) csvn = cpumcf_svn_12345_pmu_event_attr; - break; - case 6 ... 7: + else if (ci.csvn >= 6) csvn = cpumcf_svn_67_pmu_event_attr; - break; - default: - csvn = none; - } /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 06efad5b4f93..5f60248cb468 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -24,6 +24,22 @@ #include <asm/timex.h> #include <linux/io.h> +/* Perf PMU definitions for the sampling facility */ +#define PERF_CPUM_SF_MAX_CTR 2 +#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ +#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ +#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ + +#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) +#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) +#define TEAR_REG(hwc) ((hwc)->last_tag) +#define SAMPL_RATE(hwc) ((hwc)->event_base) +#define SAMPL_FLAGS(hwc) ((hwc)->config_base) +#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) +#define SAMPL_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) + /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. * A single table contains up to 511 pointers to sample-data-blocks. @@ -113,17 +129,6 @@ static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, return USEC_PER_SEC * qsi->cpu_speed / rate; } -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->header.t) - return *((unsigned long long *)&te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *)&te->timestamp[0]); -} - /* Return pointer to trailer entry of an sample data block */ static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) { @@ -154,12 +159,12 @@ static inline unsigned long *get_next_sdbt(unsigned long *s) /* * sf_disable() - Switch off sampling facility */ -static int sf_disable(void) +static void sf_disable(void) { struct hws_lsctl_request_block sreq; memset(&sreq, 0, sizeof(sreq)); - return lsctl(&sreq); + lsctl(&sreq); } /* @@ -175,41 +180,27 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw) */ static void free_sampling_buffer(struct sf_buffer *sfb) { - unsigned long *sdbt, *curr; - - if (!sfb->sdbt) - return; + unsigned long *sdbt, *curr, *head; sdbt = sfb->sdbt; - curr = sdbt; - + if (!sdbt) + return; + sfb->sdbt = NULL; /* Free the SDBT after all SDBs are processed... 
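 *
 * (The rewritten loop below treats the SDBT list as a ring: a table-link
 * entry advances to the next table and frees the table just finished,
 * while any other slot holds the physical address of one sample-data-block
 * whose page is freed; the walk stops once it is back at the head table.)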
*/ - while (1) { - if (!*curr || !sdbt) - break; - - /* Process table-link entries */ + head = sdbt; + curr = sdbt; + do { if (is_link_entry(curr)) { + /* Process table-link entries */ curr = get_next_sdbt(curr); - if (sdbt) - free_page((unsigned long)sdbt); - - /* If the origin is reached, sampling buffer is freed */ - if (curr == sfb->sdbt) - break; - else - sdbt = curr; + free_page((unsigned long)sdbt); + sdbt = curr; } else { /* Process SDB pointer */ - if (*curr) { - free_page((unsigned long)phys_to_virt(*curr)); - curr++; - } + free_page((unsigned long)phys_to_virt(*curr)); + curr++; } - } - - debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__, - (unsigned long)sfb->sdbt); + } while (curr != head); memset(sfb, 0, sizeof(*sfb)); } @@ -265,10 +256,8 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, * the sampling buffer origin. */ if (sfb->sdbt != get_next_sdbt(tail)) { - debug_sprintf_event(sfdbg, 3, "%s: " - "sampling buffer is not linked: origin %#lx" - " tail %#lx\n", __func__, - (unsigned long)sfb->sdbt, + debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n", + __func__, (unsigned long)sfb->sdbt, (unsigned long)tail); return -EINVAL; } @@ -318,9 +307,6 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; - debug_sprintf_event(sfdbg, 4, "%s: new buffer" - " settings: sdbt %lu sdb %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); return rc; } @@ -357,15 +343,8 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); - if (rc) { + if (rc) free_sampling_buffer(sfb); - debug_sprintf_event(sfdbg, 4, "%s: " - "realloc_sampling_buffer failed with rc %i\n", - __func__, rc); - } else - debug_sprintf_event(sfdbg, 4, - "%s: tear %#lx dear %#lx\n", __func__, - (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt); return rc; } @@ -377,8 +356,8 @@ static void sfb_set_limits(unsigned long min, unsigned long max) CPUM_SF_MAX_SDB = max; memset(&si, 0, sizeof(si)); - if (!qsi(&si)) - CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); + qsi(&si); + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); } static unsigned long sfb_max_limit(struct hw_perf_event *hwc) @@ -397,12 +376,6 @@ static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, return 0; } -static int sfb_has_pending_allocs(struct sf_buffer *sfb, - struct hw_perf_event *hwc) -{ - return sfb_pending_allocs(sfb, hwc) > 0; -} - static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) { /* Limit the number of SDBs to not exceed the maximum */ @@ -419,14 +392,13 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) static void deallocate_buffers(struct cpu_hw_sf *cpuhw) { - if (cpuhw->sfb.sdbt) + if (sf_buffer_available(cpuhw)) free_sampling_buffer(&cpuhw->sfb); } static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq; - size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -457,7 +429,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). 
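 *
 * [Editor's worked example, not from the patch: with the code below, a
 * sampling frequency of e.g. 20000 samples per second yields
 *
 *	n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(20000, 10000)
 *	      = CPUM_SF_MIN_SDB + 2,
 *
 * i.e. one extra 4K sample-data-block per 10000 samples per second on
 * top of the configured minimum.]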
*/ - sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); @@ -473,12 +444,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) if (sf_buffer_available(cpuhw)) return 0; - debug_sprintf_event(sfdbg, 3, - "%s: rate %lu f %lu sdb %lu/%lu" - " sample_size %lu cpuhw %p\n", __func__, - SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), - sample_size, cpuhw); - return alloc_sampling_buffer(&cpuhw->sfb, sfb_pending_allocs(&cpuhw->sfb, hwc)); } @@ -535,8 +500,6 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, if (num) sfb_account_allocs(num, hwc); - debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n", - __func__, OVERFLOW_REG(hwc), ratio, num); OVERFLOW_REG(hwc) = 0; } @@ -554,13 +517,11 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, static void extend_sampling_buffer(struct sf_buffer *sfb, struct hw_perf_event *hwc) { - unsigned long num, num_old; - int rc; + unsigned long num; num = sfb_pending_allocs(sfb, hwc); if (!num) return; - num_old = sfb->num_sdb; /* Disable the sampling facility to reset any states and also * clear pending measurement alerts. @@ -572,51 +533,32 @@ static void extend_sampling_buffer(struct sf_buffer *sfb, * called by perf. Because this is a reallocation, it is fine if the * new SDB-request cannot be satisfied immediately. */ - rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); - if (rc) - debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n", - __func__, rc); - - if (sfb_has_pending_allocs(sfb, hwc)) - debug_sprintf_event(sfdbg, 5, "%s: " - "req %lu alloc %lu remaining %lu\n", - __func__, num, sfb->num_sdb - num_old, - sfb_pending_allocs(sfb, hwc)); + realloc_sampling_buffer(sfb, num, GFP_ATOMIC); } /* Number of perf events counting hardware events */ -static atomic_t num_events; +static refcount_t num_events; /* Used to avoid races in calling reserve/release_cpumf_hardware */ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_INIT 0 #define PMC_RELEASE 1 -#define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); - int err = 0; + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + sf_disable(); switch (*((int *)flags)) { case PMC_INIT: - memset(cpusf, 0, sizeof(*cpusf)); - err = qsi(&cpusf->qsi); - if (err) - break; - cpusf->flags |= PMU_F_RESERVED; - err = sf_disable(); + memset(cpuhw, 0, sizeof(*cpuhw)); + qsi(&cpuhw->qsi); + cpuhw->flags |= PMU_F_RESERVED; break; case PMC_RELEASE: - cpusf->flags &= ~PMU_F_RESERVED; - err = sf_disable(); - if (!err) - deallocate_buffers(cpusf); + cpuhw->flags &= ~PMU_F_RESERVED; + deallocate_buffers(cpuhw); break; } - if (err) { - *((int *)flags) |= PMC_FAILURE; - pr_err("Switching off the sampling facility failed with rc %i\n", err); - } } static void release_pmc_hardware(void) @@ -627,27 +569,19 @@ static void release_pmc_hardware(void) on_each_cpu(setup_pmc_cpu, &flags, 1); } -static int reserve_pmc_hardware(void) +static void reserve_pmc_hardware(void) { int flags = PMC_INIT; on_each_cpu(setup_pmc_cpu, &flags, 1); - if (flags & PMC_FAILURE) { - release_pmc_hardware(); - return -ENODEV; - } irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; } static void hw_perf_event_destroy(struct perf_event *event) { /* Release PMC if this is the last perf event */ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if 
(atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); + if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -751,9 +685,6 @@ static unsigned long getrate(bool freq, unsigned long sample, */ if (sample_rate_to_freq(si, rate) > sysctl_perf_event_sample_rate) { - debug_sprintf_event(sfdbg, 1, "%s: " - "Sampling rate exceeds maximum " - "perf sample rate\n", __func__); rate = 0; } } @@ -798,9 +729,6 @@ static int __hw_perf_event_init_rate(struct perf_event *event, attr->sample_period = rate; SAMPL_RATE(hwc) = rate; hw_init_period(hwc, SAMPL_RATE(hwc)); - debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n", - __func__, event->cpu, event->attr.sample_period, - event->attr.freq, SAMPLE_FREQ_MODE(hwc)); return 0; } @@ -810,23 +738,16 @@ static int __hw_perf_event_init(struct perf_event *event) struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - int cpu, err; + int cpu, err = 0; /* Reserve CPU-measurement sampling facility */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); + mutex_lock(&pmc_reserve_mutex); + if (!refcount_inc_not_zero(&num_events)) { + reserve_pmc_hardware(); + refcount_set(&num_events, 1); } event->destroy = hw_perf_event_destroy; - if (err) - goto out; - /* Access per-CPU sampling information (query sampling info) */ /* * The event->cpu value can be -1 to count on every CPU, for example, @@ -838,9 +759,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ memset(&si, 0, sizeof(si)); cpuhw = NULL; - if (event->cpu == -1) + if (event->cpu == -1) { qsi(&si); - else { + } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. */ @@ -881,13 +802,9 @@ static int __hw_perf_event_init(struct perf_event *event) if (err) goto out; - /* Initialize sample data overflow accounting */ - hwc->extra_reg.reg = REG_OVERFLOW; - OVERFLOW_REG(hwc) = 0; - /* Use AUX buffer. No need to allocate it by ourself */ if (attr->config == PERF_EVENT_CPUM_SF_DIAG) - return 0; + goto out; /* Allocate the per-CPU sampling buffer using the CPU information * from the event. If the event is not pinned to a particular @@ -917,6 +834,7 @@ static int __hw_perf_event_init(struct perf_event *event) if (is_default_overflow_handler(event)) event->overflow_handler = cpumsf_output_event_pid; out: + mutex_unlock(&pmc_reserve_mutex); return err; } @@ -979,10 +897,14 @@ static void cpumsf_pmu_enable(struct pmu *pmu) struct hw_perf_event *hwc; int err; - if (cpuhw->flags & PMU_F_ENABLED) - return; - - if (cpuhw->flags & PMU_F_ERR_MASK) + /* + * Event must be + * - added/started on this CPU (PMU_F_IN_USE set) + * - and CPU must be available (PMU_F_RESERVED set) + * - and not already enabled (PMU_F_ENABLED not set) + * - and not in error condition (PMU_F_ERR_MASK not set) + */ + if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED)) return; /* Check whether to extent the sampling buffer. @@ -996,39 +918,27 @@ static void cpumsf_pmu_enable(struct pmu *pmu) * facility, but it can be fully re-enabled using sampling controls that * have been saved in cpumsf_pmu_disable(). 
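 *
 * [Editor's aside, not part of the patch: the atomic_t -> refcount_t
 * conversion in __hw_perf_event_init() and hw_perf_event_destroy()
 * above follows the usual first-user/last-user scheme for a shared
 * hardware resource:]
 *
 *	// first user reserves the facility, later users take a reference
 *	mutex_lock(&pmc_reserve_mutex);
 *	if (!refcount_inc_not_zero(&num_events)) {
 *		reserve_pmc_hardware();
 *		refcount_set(&num_events, 1);
 *	}
 *	// ... mutex released at the end of event initialization ...
 *
 *	// last user drops the count to zero with the mutex held
 *	if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) {
 *		release_pmc_hardware();
 *		mutex_unlock(&pmc_reserve_mutex);
 *	}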
*/ - if (cpuhw->event) { - hwc = &cpuhw->event->hw; - if (!(SAMPL_DIAG_MODE(hwc))) { - /* - * Account number of overflow-designated - * buffer extents - */ - sfb_account_overflows(cpuhw, hwc); - extend_sampling_buffer(&cpuhw->sfb, hwc); - } - /* Rate may be adjusted with ioctl() */ - cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); + hwc = &cpuhw->event->hw; + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(hwc); /* (Re)enable the PMU and sampling facility */ - cpuhw->flags |= PMU_F_ENABLED; - barrier(); - err = lsctl(&cpuhw->lsctl); if (err) { - cpuhw->flags &= ~PMU_F_ENABLED; pr_err("Loading sampling controls failed: op 1 err %i\n", err); return; } /* Load current program parameter */ - lpp(&S390_lowcore.lpp); - - debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " - "interval %#lx tear %#lx dear %#lx\n", __func__, - cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, - cpuhw->lsctl.cd, cpuhw->lsctl.interval, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); + lpp(&get_lowcore()->lpp); + cpuhw->flags |= PMU_F_ENABLED; } static void cpumsf_pmu_disable(struct pmu *pmu) @@ -1055,26 +965,23 @@ static void cpumsf_pmu_disable(struct pmu *pmu) return; } - /* Save state of TEAR and DEAR register contents */ - err = qsi(&si); - if (!err) { - /* TEAR/DEAR values are valid only if the sampling facility is - * enabled. Note that cpumsf_pmu_disable() might be called even - * for a disabled sampling facility because cpumsf_pmu_enable() - * controls the enable/disable state. - */ - if (si.es) { - cpuhw->lsctl.tear = si.tear; - cpuhw->lsctl.dear = si.dear; - } - } else - debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n", - __func__, err); + /* + * Save state of TEAR and DEAR register contents. + * TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + qsi(&si); + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -1083,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1166,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; if (perf_event_overflow(event, &data, ®s)) { overflow = 1; @@ -1235,11 +1142,6 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, /* Count discarded samples */ *overflow += 1; } else { - debug_sprintf_event(sfdbg, 4, - "%s: Found unknown" - " sampling data entry: te->f %i" - " basic.def %#4x (%p)\n", __func__, - te->header.f, sample->def, sample); /* Sample slot is not yet written or other record. 
* * This condition can occur if the buffer was reused @@ -1274,8 +1176,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, static void hw_perf_event_update(struct perf_event *event, int flush_all) { unsigned long long event_overflow, sampl_overflow, num_sdb; - union hws_trailer_header old, prev, new; struct hw_perf_event *hwc = &event->hw; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; unsigned long *sdbt, sdb; int done; @@ -1284,7 +1186,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * AUX buffer is used when in diagnostic sampling mode. * No perf events/samples are created. */ - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(hwc)) return; sdbt = (unsigned long *)TEAR_REG(hwc); @@ -1309,13 +1211,6 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) */ sampl_overflow += te->header.overflow; - /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " - "overflow %llu timestamp %#llx\n", - __func__, sdb, (unsigned long)sdbt, - te->header.overflow, - (te->header.f) ? trailer_timestamp(te) : 0ULL); - /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. @@ -1326,13 +1221,11 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) /* Reset trailer (using compare-double-and-swap) */ prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - old.val = prev.val; new.val = prev.val; new.f = 0; new.a = 1; new.overflow = 0; - prev.val = cmpxchg128(&te->header.val, old.val, new.val); - } while (prev.val != old.val); + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); /* Advance to next sample-data-block */ sdbt++; @@ -1340,7 +1233,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sdbt = get_next_sdbt(sdbt); /* Update event hardware registers */ - TEAR_REG(hwc) = (unsigned long) sdbt; + TEAR_REG(hwc) = (unsigned long)sdbt; /* Stop processing sample-data if all samples of the current * sample-data-block were flushed even if it was not full. @@ -1362,19 +1255,8 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * are dropped. * Slightly increase the interval to avoid hitting this limit. 
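 *
 * [Editor's aside, not part of the patch: the trailer reset above is
 * one of several places in this file where an open-coded cmpxchg128()
 * loop becomes try_cmpxchg128(); on failure the helper refreshes "prev"
 * with the current memory value, so the separate "old" copy becomes
 * unnecessary. The shape of the loop:]
 *
 *	union hws_trailer_header prev, new;
 *
 *	prev.val = READ_ONCE_ALIGNED_128(te->header.val);
 *	do {
 *		new.val = prev.val;
 *		new.f = 0;		// adjust the fields to be changed
 *		new.a = 1;
 *		new.overflow = 0;
 *	} while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));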
*/ - if (event_overflow) { + if (event_overflow) SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); - debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", - __func__, - DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); - } - - if (sampl_overflow || event_overflow) - debug_sprintf_event(sfdbg, 4, "%s: " - "overflows: sample %llu event %llu" - " total %llu num_sdb %llu\n", - __func__, sampl_overflow, event_overflow, - OVERFLOW_REG(hwc), num_sdb); } static inline unsigned long aux_sdb_index(struct aux_buffer *aux, @@ -1442,9 +1324,6 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); te->header.a = 0; - - debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", - __func__, i, range_scan, aux->head); } /* @@ -1463,7 +1342,7 @@ static int aux_output_begin(struct perf_output_handle *handle, unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; - if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + if (handle->head & ~PAGE_MASK) return -EINVAL; aux->head = handle->head >> PAGE_SHIFT; @@ -1475,10 +1354,6 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. */ - debug_sprintf_event(sfdbg, 6, - "%s: range %ld head %ld alert %ld empty %ld\n", - __func__, range, aux->head, aux->alert_mark, - aux->empty_mark); if (range > aux_sdb_num_empty(aux)) { range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; @@ -1504,12 +1379,6 @@ static int aux_output_begin(struct perf_output_handle *handle, cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " - "index %ld tear %#lx dear %#lx\n", __func__, - aux->head, aux->alert_mark, aux->empty_mark, - head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); - return 0; } @@ -1522,16 +1391,15 @@ static int aux_output_begin(struct perf_output_handle *handle, static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, unsigned long long *overflow) { - union hws_trailer_header old, prev, new; + union hws_trailer_header prev, new; struct hws_trailer_entry *te; te = aux_sdb_trailer(aux, alert_index); prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - old.val = prev.val; new.val = prev.val; - *overflow = old.overflow; - if (old.f) { + *overflow = prev.overflow; + if (prev.f) { /* * SDB is already set by hardware. 
* Abort and try to set somewhere @@ -1541,8 +1409,7 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, } new.a = 1; new.overflow = 0; - prev.val = cmpxchg128(&te->header.val, old.val, new.val); - } while (prev.val != old.val); + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); return true; } @@ -1571,14 +1438,11 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long i, range_scan, idx, idx_old; - union hws_trailer_header old, prev, new; + union hws_trailer_header prev, new; + unsigned long i, range_scan, idx; unsigned long long orig_overflow; struct hws_trailer_entry *te; - debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " - "empty %ld\n", __func__, range, aux->head, - aux->alert_mark, aux->empty_mark); if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. @@ -1601,31 +1465,26 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * indicator fall into this range, set it. */ range_scan = range - aux_sdb_num_empty(aux); - idx_old = idx = aux->empty_mark + 1; + idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); prev.val = READ_ONCE_ALIGNED_128(te->header.val); do { - old.val = prev.val; new.val = prev.val; - orig_overflow = old.overflow; + orig_overflow = prev.overflow; new.f = 0; new.overflow = 0; if (idx == aux->alert_mark) new.a = 1; else new.a = 0; - prev.val = cmpxchg128(&te->header.val, old.val, new.val); - } while (prev.val != old.val); + } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val)); *overflow += orig_overflow; } /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; - debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " - "empty %ld\n", __func__, range_scan, idx_old, - idx - 1, aux->empty_mark); return true; } @@ -1642,12 +1501,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long num_sdb; aux = perf_get_aux(handle); - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Inform user space new data arrived */ size = aux_sdb_num_alert(aux) << PAGE_SHIFT; - debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__, size >> PAGE_SHIFT); perf_aux_output_end(handle, size); @@ -1661,7 +1520,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) num_sdb); break; } - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Update head and alert_mark to new position */ @@ -1681,23 +1540,11 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " "pages to overflow\n", aux->sfb.num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " - "overflow %lld\n", __func__, - aux->head, range, overflow); } else { size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "already full, try another\n", - __func__, - aux->head, aux->alert_mark); } } - - if (done) - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "empty %ld\n", __func__, aux->head, - aux->alert_mark, aux->empty_mark); } /* @@ -1719,8 +1566,6 @@ static void aux_buffer_free(void *data) kfree(aux->sdbt_index); kfree(aux->sdb_index); kfree(aux); - - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, 
num_sdbt); } static void aux_sdb_init(unsigned long sdb) @@ -1828,9 +1673,6 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); - return aux; no_sdbt: @@ -1863,8 +1705,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) memset(&si, 0, sizeof(si)); if (event->cpu == -1) { - if (qsi(&si)) - return -ENODEV; + qsi(&si); } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. @@ -1874,7 +1715,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) si = cpuhw->qsi; } - do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + do_freq = !!SAMPL_FREQ_MODE(&event->hw); rate = getrate(do_freq, value, &si); if (!rate) return -EINVAL; @@ -1882,10 +1723,6 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) event->attr.sample_period = rate; SAMPL_RATE(&event->hw) = rate; hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); - debug_sprintf_event(sfdbg, 4, "%s:" - " cpu %d value %#llx period %#llx freq %d\n", - __func__, event->cpu, value, - event->attr.sample_period, do_freq); return 0; } @@ -1896,12 +1733,8 @@ static void cpumsf_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + if (!(event->hw.state & PERF_HES_STOPPED)) return; - - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - perf_pmu_disable(event->pmu); event->hw.state = 0; cpuhw->lsctl.cs = 1; @@ -1926,7 +1759,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) event->hw.state |= PERF_HES_STOPPED; if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event, 1); + /* CPU hotplug off removes SDBs. No samples to extract. 
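+ *
+ * [Editor's aside, not part of the patch: the per-CPU flags act as a
+ * small state machine around CPU hotplug - setup_pmc_cpu() sets
+ * PMU_F_RESERVED on PMC_INIT and clears it again on PMC_RELEASE, where
+ * it also frees the SDBs:]
+ *
+ *	case PMC_RELEASE:
+ *		cpuhw->flags &= ~PMU_F_RESERVED;
+ *		deallocate_buffers(cpuhw);
+ *
+ * [Hence the guard below: without PMU_F_RESERVED the buffers are gone
+ * and there is nothing left to flush.]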
*/ + if (cpuhw->flags & PMU_F_RESERVED) + hw_perf_event_update(event, 1); event->hw.state |= PERF_HES_UPTODATE; } perf_pmu_enable(event->pmu); @@ -1936,15 +1771,14 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); struct aux_buffer *aux; - int err; + int err = 0; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; - if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) + if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw)) return -EINVAL; - err = 0; perf_pmu_disable(event->pmu); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -2104,18 +1938,17 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Program alert request */ if (alert & CPU_MF_INT_SF_PRA) { - if (cpuhw->flags & PMU_F_IN_USE) + if (cpuhw->flags & PMU_F_IN_USE) { if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) hw_collect_aux(cpuhw); else hw_perf_event_update(cpuhw->event, 0); - else - WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); + } } /* Report measurement alerts only for non-PRA codes */ if (alert != CPU_MF_INT_SF_PRA) - debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__, alert); /* Sampling authorization change request */ @@ -2131,7 +1964,7 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Invalid sampling buffer entry */ if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) { - pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n", + pr_err("A sampling buffer entry is incorrect (alert=%#x)\n", alert); cpuhw->flags |= PMU_F_ERR_IBE; sf_disable(); @@ -2143,7 +1976,7 @@ static int cpusf_pmu_setup(unsigned int cpu, int flags) /* Ignore the notification if no events are scheduled on the PMU. * This might be racy... */ - if (!atomic_read(&num_events)) + if (!refcount_read(&num_events)) return 0; local_irq_disable(); @@ -2205,10 +2038,12 @@ static const struct kernel_param_ops param_ops_sfb_size = { .get = param_get_sfb_size, }; -#define RS_INIT_FAILURE_QSI 0x0001 -#define RS_INIT_FAILURE_BSDES 0x0002 -#define RS_INIT_FAILURE_ALRT 0x0003 -#define RS_INIT_FAILURE_PERF 0x0004 +enum { + RS_INIT_FAILURE_BSDES = 2, /* Bad basic sampling size */ + RS_INIT_FAILURE_ALRT = 3, /* IRQ registration failure */ + RS_INIT_FAILURE_PERF = 4 /* PMU registration failure */ +}; + static void __init pr_cpumsf_err(unsigned int reason) { pr_err("Sampling facility support for perf is not available: " @@ -2224,11 +2059,7 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; memset(&si, 0, sizeof(si)); - if (qsi(&si)) { - pr_cpumsf_err(RS_INIT_FAILURE_QSI); - return -ENODEV; - } - + qsi(&si); if (!si.as && !si.ad) return -ENODEV; diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index dfa77da2fd2e..2b9611c4718e 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -57,7 +57,7 @@ static unsigned long instruction_pointer_guest(struct pt_regs *regs) return sie_block(regs)->gpsw.addr; } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return is_in_guest(regs) ? instruction_pointer_guest(regs) : instruction_pointer(regs); @@ -84,7 +84,7 @@ static unsigned long perf_misc_flags_sf(struct pt_regs *regs) return flags; } -unsigned long perf_misc_flags(struct pt_regs *regs) +unsigned long perf_arch_misc_flags(struct pt_regs *regs) { /* Check if the cpum_sf PMU has created the pt_regs structure. 
* In this case, perf misc flags can be easily extracted. Otherwise, @@ -218,39 +218,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - struct stack_frame_user __user *sf; - unsigned long ip, sp; - bool first = true; - - if (is_compat_task()) - return; - perf_callchain_store(entry, instruction_pointer(regs)); - sf = (void __user *)user_stack_pointer(regs); - pagefault_disable(); - while (entry->nr < entry->max_stack) { - if (__get_user(sp, &sf->back_chain)) - break; - if (__get_user(ip, &sf->gprs[8])) - break; - if (ip & 0x1) { - /* - * If the instruction address is invalid, and this - * is the first stack frame, assume r14 has not - * been written to the stack yet. Otherwise exit. - */ - if (first && !(regs->gprs[14] & 0x1)) - ip = regs->gprs[14]; - else - break; - } - perf_callchain_store(entry, ip); - /* Sanity check: ABI requires SP to be aligned 8 bytes. */ - if (!sp || sp & 0x7) - break; - sf = (void __user *)sp; - first = false; - } - pagefault_enable(); + arch_stack_walk_user_common(NULL, NULL, entry, regs, true); } /* Perf definitions for PMU event attributes in sysfs */ @@ -260,5 +228,5 @@ ssize_t cpumf_events_sysfs_show(struct device *dev, struct perf_pmu_events_attr *pmu_attr; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%04llx\n", pmu_attr->id); + return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id); } diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 4ad472d130a3..10725f5a6f0f 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -36,8 +36,8 @@ struct paicrypt_map { struct pai_userdata *save; /* Page to store no-zero counters */ unsigned int active_events; /* # of PAI crypto users */ refcount_t refcnt; /* Reference count mapped buffers */ - enum paievt_mode mode; /* Type of event */ struct perf_event *event; /* Perf event for sampling */ + struct list_head syswide_list; /* List system-wide sampling events */ }; struct paicrypt_mapptr { @@ -84,20 +84,16 @@ static DEFINE_MUTEX(pai_reserve_mutex); /* Adjust usage counters and remove allocated memory when all users are * gone. 
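 *
 * [Editor's aside, not part of the patch: for system-wide events
 * (event->cpu == -1) the series records each CPU it allocated data for
 * in a cpumask reachable via the PAI_CPU_MASK() accessor; judging by
 * the list walk added further down, PAI_SWLIST() likewise wraps the
 * otherwise unused hw.tp_list member of struct hw_perf_event. Teardown
 * then fans out over the recorded CPUs:]
 *
 *	for_each_cpu(cpu, mask)
 *		paicrypt_event_destroy_cpu(event, cpu);
 *	kfree(mask);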
*/ -static void paicrypt_event_destroy(struct perf_event *event) +static void paicrypt_event_destroy_cpu(struct perf_event *event, int cpu) { - struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, - event->cpu); + struct paicrypt_mapptr *mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); struct paicrypt_map *cpump = mp->mapptr; - static_branch_dec(&pai_key); mutex_lock(&pai_reserve_mutex); - debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d" - " mode %d refcnt %u\n", __func__, - event->attr.config, event->cpu, - cpump->active_events, cpump->mode, + debug_sprintf_event(cfm_dbg, 5, "%s event %#llx cpu %d users %d " + "refcnt %u\n", __func__, event->attr.config, + event->cpu, cpump->active_events, refcount_read(&cpump->refcnt)); - free_page(PAI_SAVE_AREA(event)); if (refcount_dec_and_test(&cpump->refcnt)) { debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n", __func__, (unsigned long)cpump->page, @@ -111,6 +107,23 @@ static void paicrypt_event_destroy(struct perf_event *event) mutex_unlock(&pai_reserve_mutex); } +static void paicrypt_event_destroy(struct perf_event *event) +{ + int cpu; + + static_branch_dec(&pai_key); + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + + for_each_cpu(cpu, mask) + paicrypt_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paicrypt_event_destroy_cpu(event, event->cpu); + } +} + static u64 paicrypt_getctr(unsigned long *page, int nr, bool kernel) { if (kernel) @@ -156,23 +169,15 @@ static u64 paicrypt_getall(struct perf_event *event) return sum; } -/* Used to avoid races in checking concurrent access of counting and - * sampling for crypto events - * - * Only one instance of event pai_crypto/CRYPTO_ALL/ for sampling is - * allowed and when this event is running, no counting event is allowed. - * Several counting events are allowed in parallel, but no sampling event - * is allowed while one (or more) counting events are running. - * +/* Check concurrent access of counting and sampling for crypto events. * This function is called in process context and it is safe to block. * When the event initialization function fails, no other callback will * be invoked. * * Allocate the memory for the event. */ -static struct paicrypt_map *paicrypt_busy(struct perf_event *event) +static struct paicrypt_map *paicrypt_busy(struct perf_event *event, int cpu) { - struct perf_event_attr *a = &event->attr; struct paicrypt_map *cpump = NULL; struct paicrypt_mapptr *mp; int rc; @@ -185,7 +190,7 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) goto unlock; /* Allocate node for this event */ - mp = per_cpu_ptr(paicrypt_root.mapptr, event->cpu); + mp = per_cpu_ptr(paicrypt_root.mapptr, cpu); cpump = mp->mapptr; if (!cpump) { /* Paicrypt_map allocated? */ cpump = kzalloc(sizeof(*cpump), GFP_KERNEL); @@ -193,25 +198,9 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) rc = -ENOMEM; goto free_root; } + INIT_LIST_HEAD(&cpump->syswide_list); } - if (a->sample_period) { /* Sampling requested */ - if (cpump->mode != PAI_MODE_NONE) - rc = -EBUSY; /* ... sampling/counting active */ - } else { /* Counting requested */ - if (cpump->mode == PAI_MODE_SAMPLING) - rc = -EBUSY; /* ... and sampling active */ - } - /* - * This error case triggers when there is a conflict: - * Either sampling requested and counting already active, or visa - * versa. Therefore the struct paicrypto_map for this CPU is - * needed or the error could not have occurred. Only adjust root
Only adjust root - * node refcount. - */ - if (rc) - goto free_root; - /* Allocate memory for counter page and counter extraction. * Only the first counting event has to allocate a page. */ @@ -235,26 +224,58 @@ static struct paicrypt_map *paicrypt_busy(struct perf_event *event) /* Set mode and reference count */ rc = 0; refcount_set(&cpump->refcnt, 1); - cpump->mode = a->sample_period ? PAI_MODE_SAMPLING : PAI_MODE_COUNTING; mp->mapptr = cpump; - debug_sprintf_event(cfm_dbg, 5, "%s sample_period %#llx users %d" - " mode %d refcnt %u page %#lx save %p rc %d\n", - __func__, a->sample_period, cpump->active_events, - cpump->mode, refcount_read(&cpump->refcnt), + debug_sprintf_event(cfm_dbg, 5, "%s users %d refcnt %u page %#lx " + "save %p rc %d\n", __func__, cpump->active_events, + refcount_read(&cpump->refcnt), (unsigned long)cpump->page, cpump->save, rc); goto unlock; free_paicrypt_map: + /* Undo memory allocation */ kfree(cpump); mp->mapptr = NULL; free_root: paicrypt_root_free(); - unlock: mutex_unlock(&pai_reserve_mutex); return rc ? ERR_PTR(rc) : cpump; } +static int paicrypt_event_init_all(struct perf_event *event) +{ + struct paicrypt_map *cpump; + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + cpump = paicrypt_busy(event, cpu); + if (IS_ERR(cpump)) { + for_each_cpu(cpu, maskptr) + paicrypt_event_destroy_cpu(event, cpu); + kfree(maskptr); + rc = PTR_ERR(cpump); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error all cpumask are freed and all events have been destroyed. + * Save of which CPUs data structures have been allocated for. + * Release them in paicrypt_event_destroy call back function + * for this event. + */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + /* Might be called on different CPU than the one the event is intended for. */ static int paicrypt_event_init(struct perf_event *event) { @@ -269,10 +290,7 @@ static int paicrypt_event_init(struct perf_event *event) if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) return -EINVAL; - /* Allow only CPU wide operation, no process context for now. */ - if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) - return -ENOENT; - /* Allow only CRYPTO_ALL for sampling. 
*/ + /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; /* Get a page to store last counter values for sampling */ @@ -284,13 +302,17 @@ static int paicrypt_event_init(struct perf_event *event) } } - cpump = paicrypt_busy(event); - if (IS_ERR(cpump)) { + if (event->cpu >= 0) { + cpump = paicrypt_busy(event, event->cpu); + if (IS_ERR(cpump)) + rc = PTR_ERR(cpump); + } else { + rc = paicrypt_event_init_all(event); + } + if (rc) { free_page(PAI_SAVE_AREA(event)); - rc = PTR_ERR(cpump); goto out; } - event->destroy = paicrypt_event_destroy; if (a->sample_period) { @@ -331,8 +353,14 @@ static void paicrypt_start(struct perf_event *event, int flags) sum = paicrypt_getall(event); /* Get current value */ local64_set(&event->hw.prev_count, sum); } else { /* Sampling */ - cpump->event = event; - perf_sched_cb_inc(event->pmu); + memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } } } @@ -344,7 +372,7 @@ static int paicrypt_add(struct perf_event *event, int flags) if (++cpump->active_events == 1) { ccd = virt_to_phys(cpump->page) | PAI_CRYPTO_KERNEL_OFFSET; - WRITE_ONCE(S390_lowcore.ccd, ccd); + WRITE_ONCE(get_lowcore()->ccd, ccd); local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); } if (flags & PERF_EF_START) @@ -353,6 +381,7 @@ static int paicrypt_add(struct perf_event *event, int flags) return 0; } +static void paicrypt_have_sample(struct perf_event *, struct paicrypt_map *); static void paicrypt_stop(struct perf_event *event, int flags) { struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); @@ -361,8 +390,13 @@ static void paicrypt_stop(struct perf_event *event, int flags) if (!event->attr.sample_period) { /* Counting */ paicrypt_read(event); } else { /* Sampling */ - perf_sched_cb_dec(event->pmu); - cpump->event = NULL; + if (!(event->attach_state & PERF_ATTACH_TASK)) { + perf_sched_cb_dec(event->pmu); + list_del(PAI_SWLIST(event)); + } else { + paicrypt_have_sample(event, cpump); + cpump->event = NULL; + } } event->hw.state = PERF_HES_STOPPED; } @@ -375,7 +409,7 @@ static void paicrypt_del(struct perf_event *event, int flags) paicrypt_stop(event, PERF_EF_UPDATE); if (--cpump->active_events == 0) { local_ctl_clear_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT); - WRITE_ONCE(S390_lowcore.ccd, 0); + WRITE_ONCE(get_lowcore()->ccd, 0); } } @@ -444,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); @@ -455,23 +489,30 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, } /* Check if there is data to be saved on schedule out of a task. 
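 *
 * [Editor's aside, not part of the patch: with the -ENOENT check gone
 * from paicrypt_event_init() above, task-context and cpu == -1 events
 * are now accepted. A hedged userspace sketch; the PMU type value is
 * read from /sys/bus/event_source/devices/pai_crypto/type, and 0x1000
 * is PAI_CRYPTO_BASE, the CRYPTO_ALL event required for sampling:]
 *
 *	struct perf_event_attr attr = {
 *		.type = pai_crypto_type,	// read from sysfs
 *		.config = 0x1000,		// CRYPTO_ALL
 *		.sample_period = 1000,
 *		.sample_type = PERF_SAMPLE_RAW,
 *	};
 *	// this task, any CPU - rejected with -ENOENT before this change
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);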
*/ -static int paicrypt_have_sample(void) +static void paicrypt_have_sample(struct perf_event *event, + struct paicrypt_map *cpump) { - struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); - struct paicrypt_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; size_t rawsize; - int rc = 0; if (!event) /* No event active */ - return 0; + return; rawsize = paicrypt_copy(cpump->save, cpump->page, (unsigned long *)PAI_SAVE_AREA(event), - cpump->event->attr.exclude_user, - cpump->event->attr.exclude_kernel); + event->attr.exclude_user, + event->attr.exclude_kernel); if (rawsize) /* No incremented counters */ - rc = paicrypt_push_sample(rawsize, cpump, event); - return rc; + paicrypt_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paicrypt_have_samples(void) +{ + struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr); + struct paicrypt_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paicrypt_have_sample(event, cpump); } /* Called on schedule-in and schedule-out. No access to event structure, @@ -480,10 +521,10 @@ static int paicrypt_have_sample(void) static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, clear values. + * results on schedule_out and if page was dirty, save old values. */ if (!sched_in) - paicrypt_have_sample(); + paicrypt_have_samples(); } /* Attribute definitions for paicrypt interface. As with other CPU @@ -527,7 +568,7 @@ static const struct attribute_group *paicrypt_attr_groups[] = { /* Performance monitoring unit for mapped counters */ static struct pmu paicrypt = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = paicrypt_event_init, .add = paicrypt_add, .del = paicrypt_del, @@ -697,6 +738,22 @@ static const char * const paicrypt_ctrnames[] = { [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", [155] = "IBM_RESERVED_155", [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", }; static void __init attr_event_free(struct attribute **attrs, int num) diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index a6da7e0cc7a6..a8f0bad99cf0 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -47,11 +47,11 @@ struct paiext_cb { /* PAI extension 1 control block */ struct paiext_map { unsigned long *area; /* Area for CPU to store counters */ struct pai_userdata *save; /* Area to store non-zero counters */ - enum paievt_mode mode; /* Type of event */ unsigned int active_events; /* # of PAI Extension users */ refcount_t refcnt; struct perf_event *event; /* Perf event for sampling */ struct paiext_cb *paiext_cb; /* PAI extension control block area */ + struct list_head syswide_list; /* List system-wide sampling 
events */ }; struct paiext_mapptr { @@ -70,6 +70,8 @@ static void paiext_root_free(void) free_percpu(paiext_root.mapptr); paiext_root.mapptr = NULL; } + debug_sprintf_event(paiext_dbg, 5, "%s root.refcount %d\n", __func__, + refcount_read(&paiext_root.refcnt)); } /* On initialization of first event also allocate per CPU data dynamically. @@ -115,20 +117,34 @@ static void paiext_free(struct paiext_mapptr *mp) } /* Release the PMU if event is the last perf event */ -static void paiext_event_destroy(struct perf_event *event) +static void paiext_event_destroy_cpu(struct perf_event *event, int cpu) { - struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, cpu); struct paiext_map *cpump = mp->mapptr; - free_page(PAI_SAVE_AREA(event)); mutex_lock(&paiext_reserve_mutex); if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */ paiext_free(mp); paiext_root_free(); mutex_unlock(&paiext_reserve_mutex); - debug_sprintf_event(paiext_dbg, 4, "%s cpu %d mapptr %p\n", __func__, - event->cpu, mp->mapptr); +} + +static void paiext_event_destroy(struct perf_event *event) +{ + int cpu; + + free_page(PAI_SAVE_AREA(event)); + if (event->cpu == -1) { + struct cpumask *mask = PAI_CPU_MASK(event); + for_each_cpu(cpu, mask) + paiext_event_destroy_cpu(event, cpu); + kfree(mask); + } else { + paiext_event_destroy_cpu(event, event->cpu); + } + debug_sprintf_event(paiext_dbg, 4, "%s cpu %d\n", __func__, + event->cpu); } /* Used to avoid races in checking concurrent access of counting and @@ -145,19 +161,18 @@ static void paiext_event_destroy(struct perf_event *event) * * Allocate the memory for the event. */ -static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) +static int paiext_alloc_cpu(struct perf_event *event, int cpu) { struct paiext_mapptr *mp; struct paiext_map *cpump; int rc; mutex_lock(&paiext_reserve_mutex); - rc = paiext_root_alloc(); if (rc) goto unlock; - mp = per_cpu_ptr(paiext_root.mapptr, event->cpu); + mp = per_cpu_ptr(paiext_root.mapptr, cpu); cpump = mp->mapptr; if (!cpump) { /* Paiext_map allocated? */ rc = -ENOMEM; @@ -185,24 +200,13 @@ static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event) paiext_free(mp); goto undo; } + INIT_LIST_HEAD(&cpump->syswide_list); refcount_set(&cpump->refcnt, 1); - cpump->mode = a->sample_period ? PAI_MODE_SAMPLING - : PAI_MODE_COUNTING; + rc = 0; } else { - /* Multiple invocation, check what is active. - * Supported are multiple counter events or only one sampling - * event concurrently at any one time. - */ - if (cpump->mode == PAI_MODE_SAMPLING || - (cpump->mode == PAI_MODE_COUNTING && a->sample_period)) { - rc = -EBUSY; - goto undo; - } refcount_inc(&cpump->refcnt); } - rc = 0; - undo: if (rc) { /* Error in allocation of event, decrement anchor. Since @@ -217,6 +221,38 @@ unlock: return rc; } +static int paiext_alloc(struct perf_event *event) +{ + struct cpumask *maskptr; + int cpu, rc = -ENOMEM; + + maskptr = kzalloc(sizeof(*maskptr), GFP_KERNEL); + if (!maskptr) + goto out; + + for_each_online_cpu(cpu) { + rc = paiext_alloc_cpu(event, cpu); + if (rc) { + for_each_cpu(cpu, maskptr) + paiext_event_destroy_cpu(event, cpu); + kfree(maskptr); + goto out; + } + cpumask_set_cpu(cpu, maskptr); + } + + /* + * On error, all cpumasks are freed and all events have been destroyed. + * On success, record which CPUs data structures have been allocated + * for, and release them in the paiext_event_destroy() callback for + * this event.
+ */ + PAI_CPU_MASK(event) = maskptr; + rc = 0; +out: + return rc; +} + /* The PAI extension 1 control block supports up to 128 entries. Return * the index within PAIE1_CB given the event number. Also validate event * number. @@ -246,9 +282,6 @@ static int paiext_event_init(struct perf_event *event) rc = paiext_event_valid(event); if (rc) return rc; - /* Allow only CPU wide operation, no process context for now. */ - if ((event->attach_state & PERF_ATTACH_TASK) || event->cpu == -1) - return -ENOENT; /* Allow only event NNPA_ALL for sampling. */ if (a->sample_period && a->config != PAI_NNPA_BASE) return -EINVAL; @@ -262,7 +295,10 @@ static int paiext_event_init(struct perf_event *event) return -ENOMEM; } - rc = paiext_alloc(a, event); + if (event->cpu >= 0) + rc = paiext_alloc_cpu(event, event->cpu); + else + rc = paiext_alloc(event); if (rc) { free_page(PAI_SAVE_AREA(event)); return rc; @@ -334,8 +370,15 @@ static void paiext_start(struct perf_event *event, int flags) sum = paiext_getall(event); /* Get current value */ local64_set(&event->hw.prev_count, sum); } else { /* Sampling */ - cpump->event = event; - perf_sched_cb_inc(event->pmu); + memcpy((void *)PAI_SAVE_AREA(event), cpump->area, + PAIE1_CTRBLOCK_SZ); + /* Enable context switch callback for system-wide sampling */ + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_add_tail(PAI_SWLIST(event), &cpump->syswide_list); + perf_sched_cb_inc(event->pmu); + } else { + cpump->event = event; + } } } @@ -346,12 +389,10 @@ static int paiext_add(struct perf_event *event, int flags) struct paiext_cb *pcb = cpump->paiext_cb; if (++cpump->active_events == 1) { - S390_lowcore.aicd = virt_to_phys(cpump->paiext_cb); + get_lowcore()->aicd = virt_to_phys(cpump->paiext_cb); pcb->acc = virt_to_phys(cpump->area) | 0x1; /* Enable CPU instruction lookup for PAIE1 control block */ local_ctl_set_bit(0, CR0_PAI_EXTENSION_BIT); - debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", - __func__, S390_lowcore.aicd, pcb->acc); } if (flags & PERF_EF_START) paiext_start(event, PERF_EF_RELOAD); @@ -359,6 +400,7 @@ static int paiext_add(struct perf_event *event, int flags) return 0; } +static void paiext_have_sample(struct perf_event *, struct paiext_map *); static void paiext_stop(struct perf_event *event, int flags) { struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); @@ -367,8 +409,13 @@ static void paiext_stop(struct perf_event *event, int flags) if (!event->attr.sample_period) { /* Counting */ paiext_read(event); } else { /* Sampling */ - perf_sched_cb_dec(event->pmu); - cpump->event = NULL; + if (!(event->attach_state & PERF_ATTACH_TASK)) { + list_del(PAI_SWLIST(event)); + perf_sched_cb_dec(event->pmu); + } else { + paiext_have_sample(event, cpump); + cpump->event = NULL; + } } event->hw.state = PERF_HES_STOPPED; } @@ -384,9 +431,7 @@ static void paiext_del(struct perf_event *event, int flags) /* Disable CPU instruction lookup for PAIE1 control block */ local_ctl_clear_bit(0, CR0_PAI_EXTENSION_BIT); pcb->acc = 0; - S390_lowcore.aicd = 0; - debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n", - __func__, S390_lowcore.aicd, pcb->acc); + get_lowcore()->aicd = 0; } } @@ -458,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); @@ -470,21 +515,28 @@ static int 
paiext_push_sample(size_t rawsize, struct paiext_map *cpump, } /* Check if there is data to be saved on schedule out of a task. */ -static int paiext_have_sample(void) +static void paiext_have_sample(struct perf_event *event, + struct paiext_map *cpump) { - struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); - struct paiext_map *cpump = mp->mapptr; - struct perf_event *event = cpump->event; size_t rawsize; - int rc = 0; if (!event) - return 0; + return; rawsize = paiext_copy(cpump->save, cpump->area, (unsigned long *)PAI_SAVE_AREA(event)); if (rawsize) /* Incremented counters */ - rc = paiext_push_sample(rawsize, cpump, event); - return rc; + paiext_push_sample(rawsize, cpump, event); +} + +/* Check if there is data to be saved on schedule out of a task. */ +static void paiext_have_samples(void) +{ + struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr); + struct paiext_map *cpump = mp->mapptr; + struct perf_event *event; + + list_for_each_entry(event, &cpump->syswide_list, hw.tp_list) + paiext_have_sample(event, cpump); } /* Called on schedule-in and schedule-out. No access to event structure, @@ -493,10 +545,10 @@ static int paiext_have_sample(void) static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) { /* We started with a clean page on event installation. So read out - * results on schedule_out and if page was dirty, clear values. + * results on schedule_out and if page was dirty, save old values. */ if (!sched_in) - paiext_have_sample(); + paiext_have_samples(); } /* Attribute definitions for pai extension1 interface. As with other CPU @@ -542,7 +594,7 @@ static const struct attribute_group *paiext_attr_groups[] = { /* Performance monitoring unit for mapped counters */ static struct pmu paiext = { - .task_ctx_nr = perf_invalid_context, + .task_ctx_nr = perf_hw_context, .event_init = paiext_event_init, .add = paiext_add, .del = paiext_del, @@ -583,6 +635,15 @@ static const char * const paiext_ctrnames[] = { [25] = "NNPA_1MFRAME", [26] = "NNPA_2GFRAME", [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", }; static void __init attr_event_free(struct attribute **attrs, int num) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index dd456b475861..9637aee43c40 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -71,10 +71,10 @@ void flush_thread(void) void arch_setup_new_exec(void) { - if (S390_lowcore.current_pid != current->pid) { - S390_lowcore.current_pid = current->pid; + if (get_lowcore()->current_pid != current->pid) { + get_lowcore()->current_pid = current->pid; if (test_facility(40)) - lpp(&S390_lowcore.lpp); + lpp(&get_lowcore()->lpp); } } @@ -86,11 +86,6 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - /* - * Save the floating-point or vector register state of the current - * task and set the TIF_FPU flag to lazy restore the FPU register - * state when returning to user space. 
- */ save_user_fpu_regs(); *dst = *src; diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 65c1464eea4f..5ce9a795a0fe 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -17,7 +17,8 @@ #include <linux/mm_types.h> #include <linux/delay.h> #include <linux/cpu.h> - +#include <linux/smp.h> +#include <asm/text-patching.h> #include <asm/diag.h> #include <asm/facility.h> #include <asm/elf.h> @@ -79,6 +80,23 @@ void notrace stop_machine_yield(const struct cpumask *cpumask) } } +static void do_sync_core(void *info) +{ + sync_core(); +} + +void text_poke_sync(void) +{ + on_each_cpu(do_sync_core, NULL, 1); +} + +void text_poke_sync_lock(void) +{ + cpus_read_lock(); + text_poke_sync(); + cpus_read_unlock(); +} + /* * cpu_init - initializes state that is per-CPU. */ diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 88087a32ebc6..69fcaf54d5ca 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -9,6 +9,7 @@ #include <asm/asm-offsets.h> #include <asm/nospec-insn.h> #include <asm/sigp.h> +#include <asm/lowcore.h> GEN_BR_THUNK %r9 @@ -20,20 +21,15 @@ # r3 = Parameter for function # SYM_CODE_START(store_status) - /* Save register one and load save area base */ - stg %r1,__LC_SAVE_AREA_RESTART + STMG_LC %r0,%r15,__LC_GPREGS_SAVE_AREA /* General purpose registers */ - lghi %r1,__LC_GPREGS_SAVE_AREA - stmg %r0,%r15,0(%r1) - mvc 8(8,%r1),__LC_SAVE_AREA_RESTART + GET_LC %r13 /* Control registers */ - lghi %r1,__LC_CREGS_SAVE_AREA - stctg %c0,%c15,0(%r1) + stctg %c0,%c15,__LC_CREGS_SAVE_AREA(%r13) /* Access registers */ - lghi %r1,__LC_AREGS_SAVE_AREA - stam %a0,%a15,0(%r1) + stamy %a0,%a15,__LC_AREGS_SAVE_AREA(%r13) /* Floating point registers */ - lghi %r1,__LC_FPREGS_SAVE_AREA + lay %r1,__LC_FPREGS_SAVE_AREA(%r13) std %f0, 0x00(%r1) std %f1, 0x08(%r1) std %f2, 0x10(%r1) @@ -51,21 +47,21 @@ SYM_CODE_START(store_status) std %f14,0x70(%r1) std %f15,0x78(%r1) /* Floating point control register */ - lghi %r1,__LC_FP_CREG_SAVE_AREA + lay %r1,__LC_FP_CREG_SAVE_AREA(%r13) stfpc 0(%r1) /* CPU timer */ - lghi %r1,__LC_CPU_TIMER_SAVE_AREA + lay %r1,__LC_CPU_TIMER_SAVE_AREA(%r13) stpt 0(%r1) /* Store prefix register */ - lghi %r1,__LC_PREFIX_SAVE_AREA + lay %r1,__LC_PREFIX_SAVE_AREA(%r13) stpx 0(%r1) /* Clock comparator - seven bytes */ - lghi %r1,__LC_CLOCK_COMP_SAVE_AREA larl %r4,clkcmp stckc 0(%r4) + lay %r1,__LC_CLOCK_COMP_SAVE_AREA(%r13) mvc 1(7,%r1),1(%r4) /* Program status word */ - lghi %r1,__LC_PSW_SAVE_AREA + lay %r1,__LC_PSW_SAVE_AREA(%r13) epsw %r4,%r5 st %r4,0(%r1) st %r5,4(%r1) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 24ed33f044ec..d78bcfe707b5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -146,30 +146,35 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; unsigned long __bootdata_preserved(max_mappable); -unsigned long __bootdata(ident_map_size); struct physmem_info __bootdata(physmem_info); -unsigned long __bootdata_preserved(__kaslr_offset); +struct vm_layout __bootdata_preserved(vm_layout); +EXPORT_SYMBOL(vm_layout); int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); u64 __bootdata_preserved(stfle_fac_list[16]); EXPORT_SYMBOL(stfle_fac_list); -u64 __bootdata_preserved(alt_stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -unsigned long VMALLOC_START; +char 
__bootdata(boot_rb)[PAGE_SIZE * 2]; +bool __bootdata(boot_earlyprintk); +size_t __bootdata(boot_rb_off); +char __bootdata(bootdebug_filter)[128]; +bool __bootdata(bootdebug); + +unsigned long __bootdata_preserved(VMALLOC_START); EXPORT_SYMBOL(VMALLOC_START); -unsigned long VMALLOC_END; +unsigned long __bootdata_preserved(VMALLOC_END); EXPORT_SYMBOL(VMALLOC_END); -struct page *vmemmap; +struct page *__bootdata_preserved(vmemmap); EXPORT_SYMBOL(vmemmap); -unsigned long vmemmap_size; +unsigned long __bootdata_preserved(vmemmap_size); -unsigned long MODULES_VADDR; -unsigned long MODULES_END; +unsigned long __bootdata_preserved(MODULES_VADDR); +unsigned long __bootdata_preserved(MODULES_END); /* An array with a pointer to the lowcore of every CPU. */ struct lowcore *lowcore_ptr[NR_CPUS]; @@ -360,36 +365,24 @@ void *restart_stack; unsigned long stack_alloc(void) { -#ifdef CONFIG_VMAP_STACK - void *ret; + void *stack; - ret = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, - NUMA_NO_NODE, __builtin_return_address(0)); - kmemleak_not_leak(ret); - return (unsigned long)ret; -#else - return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); -#endif + stack = __vmalloc_node(THREAD_SIZE, THREAD_SIZE, THREADINFO_GFP, + NUMA_NO_NODE, __builtin_return_address(0)); + kmemleak_not_leak(stack); + return (unsigned long)stack; } void stack_free(unsigned long stack) { -#ifdef CONFIG_VMAP_STACK - vfree((void *) stack); -#else - free_pages(stack, THREAD_SIZE_ORDER); -#endif + vfree((void *)stack); } static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!stack) { - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - } + stack = (unsigned long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); return stack; } @@ -406,6 +399,7 @@ static void __init setup_lowcore(void) panic("%s: Failed to allocate %zu bytes align=%zx\n", __func__, sizeof(*lc), sizeof(*lc)); + lc->pcpu = (unsigned long)per_cpu_ptr(&pcpu_devices, 0); lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT; lc->restart_psw.addr = __pa(restart_int_handler); lc->external_new_psw.mask = PSW_KERNEL_BITS; @@ -421,16 +415,16 @@ static void __init setup_lowcore(void) lc->clock_comparator = clock_comparator_max; lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; - lc->machine_flags = S390_lowcore.machine_flags; - lc->preempt_count = S390_lowcore.preempt_count; + lc->machine_flags = get_lowcore()->machine_flags; + lc->preempt_count = get_lowcore()->preempt_count; nmi_alloc_mcesa_early(&lc->mcesad); - lc->sys_enter_timer = S390_lowcore.sys_enter_timer; - lc->exit_timer = S390_lowcore.exit_timer; - lc->user_timer = S390_lowcore.user_timer; - lc->system_timer = S390_lowcore.system_timer; - lc->steal_timer = S390_lowcore.steal_timer; - lc->last_update_timer = S390_lowcore.last_update_timer; - lc->last_update_clock = S390_lowcore.last_update_clock; + lc->sys_enter_timer = get_lowcore()->sys_enter_timer; + lc->exit_timer = get_lowcore()->exit_timer; + lc->user_timer = get_lowcore()->user_timer; + lc->system_timer = get_lowcore()->system_timer; + lc->steal_timer = get_lowcore()->steal_timer; + lc->last_update_timer = get_lowcore()->last_update_timer; + lc->last_update_clock = get_lowcore()->last_update_clock; /* * Allocate the global restart stack which is the same for * all CPUs in case *one* of them does a PSW restart. 
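
[Editor's aside, not part of the patch: the recurring S390_lowcore ->
get_lowcore() rewrite in these hunks replaces direct references to the
fixed low-address lowcore symbol with an accessor, the prerequisite for
running with a relocated lowcore (compare the new "Lowcore relocated to
..." message further down). The conversion itself is mechanical:]

	lpp(&S390_lowcore.lpp);				/* before */
	lpp(&get_lowcore()->lpp);			/* after  */

	lc->kernel_stack = S390_lowcore.kernel_stack;	/* before */
	lc->kernel_stack = get_lowcore()->kernel_stack;	/* after  */
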
@@ -439,7 +433,7 @@ static void __init setup_lowcore(void) lc->mcck_stack = stack_alloc_early() + STACK_INIT_OFFSET; lc->async_stack = stack_alloc_early() + STACK_INIT_OFFSET; lc->nodat_stack = stack_alloc_early() + STACK_INIT_OFFSET; - lc->kernel_stack = S390_lowcore.kernel_stack; + lc->kernel_stack = get_lowcore()->kernel_stack; /* * Set up PSW restart to call ipl.c:do_restart(). Copy the relevant * restart data to the absolute zero lowcore. This is necessary if @@ -455,8 +449,8 @@ static void __init setup_lowcore(void) lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); lc->preempt_count = PREEMPT_DISABLED; - lc->kernel_asce = S390_lowcore.kernel_asce; - lc->user_asce = S390_lowcore.user_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; + lc->user_asce = get_lowcore()->user_asce; system_ctlreg_init_save_area(lc); abs_lc = get_abs_lowcore(); @@ -512,10 +506,7 @@ static void __init setup_resources(void) bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -534,10 +525,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -704,7 +692,7 @@ static void __init reserve_physmem_info(void) { unsigned long addr, size; - if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) memblock_reserve(addr, size); } @@ -712,7 +700,7 @@ static void __init free_physmem_info(void) { unsigned long addr, size; - if (get_physmem_reserved(RR_MEM_DETECT_EXTENDED, &addr, &size)) + if (get_physmem_reserved(RR_MEM_DETECT_EXT, &addr, &size)) memblock_phys_free(addr, size); } @@ -734,7 +722,23 @@ static void __init memblock_add_physmem_info(void) } /* - * Reserve memory used for lowcore/command line/kernel image. + * Reserve memory used for lowcore. + */ +static void __init reserve_lowcore(void) +{ + void *lowcore_start = get_lowcore(); + void *lowcore_end = lowcore_start + sizeof(struct lowcore); + void *start, *end; + + if (absolute_pointer(__identity_base) < lowcore_end) { + start = max(lowcore_start, (void *)__identity_base); + end = min(lowcore_end, (void *)(__identity_base + ident_map_size)); + memblock_reserve(__pa(start), __pa(end)); + } +} + +/* + * Reserve memory used for absolute lowcore/command line/kernel image. 
*/ static void __init reserve_kernel(void) { @@ -765,7 +769,7 @@ static void __init relocate_amode31_section(void) unsigned long amode31_size = __eamode31 - __samode31; long amode31_offset, *ptr; - amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31; + amode31_offset = AMODE31_START - (unsigned long)__samode31; pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); /* Move original AMODE31 section to the new one */ @@ -808,9 +812,7 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); @@ -870,6 +872,23 @@ static void __init log_component_list(void) } /* + * Print avoiding interpretation of % in buf and taking bootdebug option + * into consideration. + */ +static void __init print_rb_entry(const char *buf) +{ + char fmt[] = KERN_SOH "0boot: %s"; + int level = printk_get_level(buf); + + buf = skip_timestamp(printk_skip_level(buf)); + if (level == KERN_DEBUG[1] && (!bootdebug || !bootdebug_filter_match(buf))) + return; + + fmt[1] = level; + printk(fmt, buf); +} + +/* * Setup function called from init/main.c just after the banner * was printed. */ @@ -888,6 +907,12 @@ void __init setup_arch(char **cmdline_p) pr_info("Linux is running natively in 64-bit mode\n"); else pr_info("Linux is running as a guest in 64-bit mode\n"); + /* Print decompressor messages if not already printed */ + if (!boot_earlyprintk) + boot_rb_foreach(print_rb_entry); + + if (have_relocated_lowcore()) + pr_info("Lowcore relocated to 0x%px\n", get_lowcore()); log_component_list(); @@ -915,6 +940,7 @@ void __init setup_arch(char **cmdline_p) /* Do some memory reservations *before* memory is added to memblock */ reserve_pgtables(); + reserve_lowcore(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); @@ -959,6 +985,7 @@ void __init setup_arch(char **cmdline_p) if (test_facility(193)) static_branch_enable(&cpu_has_bear); + setup_protection_map(); /* * Create kernel page tables. 
*/ @@ -986,3 +1013,8 @@ void __init setup_arch(char **cmdline_p) /* Add system specific data to the random pool */ setup_randomness(); } + +void __init arch_cpu_finalize_init(void) +{ + sclp_init(); +} diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6c2cb345402f..e48013cd832c 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -30,9 +30,9 @@ #include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> +#include <asm/vdso-symbols.h> #include <asm/access-regs.h> #include <asm/lowcore.h> -#include <asm/vdso.h> #include "entry.h" /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 0324649aae0a..7b08399b0846 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -74,18 +74,15 @@ enum { CPU_STATE_CONFIGURED, }; -static DEFINE_PER_CPU(struct cpu *, cpu_device); - -struct pcpu { - unsigned long ec_mask; /* bit mask for ec_xxx functions */ - unsigned long ec_clk; /* sigp timestamp for ec_xxx */ - signed char state; /* physical cpu state */ - signed char polarization; /* physical polarization */ - u16 address; /* physical cpu address */ -}; - static u8 boot_core_type; -static struct pcpu pcpu_devices[NR_CPUS]; +DEFINE_PER_CPU(struct pcpu, pcpu_devices); +/* + * Pointer to the pcpu area of the boot CPU. This is required when a restart + * interrupt is triggered on an offline CPU. For that case accessing percpu + * data with the common primitives does not work, since the percpu offset is + * stored in a non existent lowcore. + */ +static struct pcpu *ipl_pcpu; unsigned int smp_cpu_mt_shift; EXPORT_SYMBOL(smp_cpu_mt_shift); @@ -176,8 +173,8 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address) int cpu; for_each_cpu(cpu, mask) - if (pcpu_devices[cpu].address == address) - return pcpu_devices + cpu; + if (per_cpu(pcpu_devices, cpu).address == address) + return &per_cpu(pcpu_devices, cpu); return NULL; } @@ -203,7 +200,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) mcck_stack = stack_alloc(); if (!lc || !nodat_stack || !async_stack || !mcck_stack) goto out; - memcpy(lc, &S390_lowcore, 512); + memcpy(lc, get_lowcore(), 512); memset((char *) lc + 512, 0, sizeof(*lc) - 512); lc->async_stack = async_stack + STACK_INIT_OFFSET; lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET; @@ -232,13 +229,11 @@ out: return -ENOMEM; } -static void pcpu_free_lowcore(struct pcpu *pcpu) +static void pcpu_free_lowcore(struct pcpu *pcpu, int cpu) { unsigned long async_stack, nodat_stack, mcck_stack; struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET; async_stack = lc->async_stack - STACK_INIT_OFFSET; @@ -261,13 +256,14 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; + lc->pcpu = (unsigned long)pcpu; lc->restart_flags = RESTART_FLAG_CTLREGS; lc->spinlock_lockval = arch_spin_lockval(cpu); lc->spinlock_index = 0; lc->percpu_offset = __per_cpu_offset[cpu]; - lc->kernel_asce = S390_lowcore.kernel_asce; + lc->kernel_asce = get_lowcore()->kernel_asce; lc->user_asce = s390_invalid_asce; - lc->machine_flags = S390_lowcore.machine_flags; + lc->machine_flags = get_lowcore()->machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; abs_lc = get_abs_lowcore(); @@ -279,12 +275,10 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) 
arch_spin_lock_setup(cpu); } -static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) +static void pcpu_attach_task(int cpu, struct task_struct *tsk) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET; lc->current_task = (unsigned long)tsk; @@ -298,18 +292,16 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) lc->steal_timer = 0; } -static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) +static void pcpu_start_fn(int cpu, void (*func)(void *), void *data) { struct lowcore *lc; - int cpu; - cpu = pcpu - pcpu_devices; lc = lowcore_ptr[cpu]; lc->restart_stack = lc->kernel_stack; lc->restart_fn = (unsigned long) func; lc->restart_data = (unsigned long) data; lc->restart_source = -1U; - pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); + pcpu_sigp_retry(per_cpu_ptr(&pcpu_devices, cpu), SIGP_RESTART, 0); } typedef void (pcpu_delegate_fn)(void *); @@ -322,14 +314,14 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) func(data); /* should not return */ } -static void pcpu_delegate(struct pcpu *pcpu, +static void pcpu_delegate(struct pcpu *pcpu, int cpu, pcpu_delegate_fn *func, void *data, unsigned long stack) { struct lowcore *lc, *abs_lc; unsigned int source_cpu; - lc = lowcore_ptr[pcpu - pcpu_devices]; + lc = lowcore_ptr[cpu]; source_cpu = stap(); if (pcpu->address == source_cpu) { @@ -379,38 +371,22 @@ static int pcpu_set_smt(unsigned int mtid) smp_cpu_mt_shift = 0; while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift)) smp_cpu_mt_shift++; - pcpu_devices[0].address = stap(); + per_cpu(pcpu_devices, 0).address = stap(); } return cc; } /* - * Call function on an online CPU. - */ -void smp_call_online_cpu(void (*func)(void *), void *data) -{ - struct pcpu *pcpu; - - /* Use the current cpu if it is online. */ - pcpu = pcpu_find_address(cpu_online_mask, stap()); - if (!pcpu) - /* Use the first online cpu. */ - pcpu = pcpu_devices + cpumask_first(cpu_online_mask); - pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack); -} - -/* * Call function on the ipl CPU. 
*/ void smp_call_ipl_cpu(void (*func)(void *), void *data) { struct lowcore *lc = lowcore_ptr[0]; - if (pcpu_devices[0].address == stap()) - lc = &S390_lowcore; + if (ipl_pcpu->address == stap()) + lc = get_lowcore(); - pcpu_delegate(&pcpu_devices[0], func, data, - lc->nodat_stack); + pcpu_delegate(ipl_pcpu, 0, func, data, lc->nodat_stack); } int smp_find_processor_id(u16 address) @@ -418,21 +394,21 @@ int smp_find_processor_id(u16 address) int cpu; for_each_present_cpu(cpu) - if (pcpu_devices[cpu].address == address) + if (per_cpu(pcpu_devices, cpu).address == address) return cpu; return -1; } void schedule_mcck_handler(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_mcck_pending); } bool notrace arch_vcpu_is_preempted(int cpu) { if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) return false; - if (pcpu_running(pcpu_devices + cpu)) + if (pcpu_running(per_cpu_ptr(&pcpu_devices, cpu))) return false; return true; } @@ -444,7 +420,7 @@ void notrace smp_yield_cpu(int cpu) return; diag_stat_inc_norecursion(DIAG_STAT_X09C); asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); + : : "d" (per_cpu(pcpu_devices, cpu).address)); } EXPORT_SYMBOL_GPL(smp_yield_cpu); @@ -465,7 +441,7 @@ void notrace smp_emergency_stop(void) end = get_tod_clock() + (1000000UL << 12); for_each_cpu(cpu, &cpumask) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); set_bit(ec_stop_cpu, &pcpu->ec_mask); while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL, 0, NULL) == SIGP_CC_BUSY && @@ -474,7 +450,7 @@ void notrace smp_emergency_stop(void) } while (get_tod_clock() < end) { for_each_cpu(cpu, &cpumask) - if (pcpu_stopped(pcpu_devices + cpu)) + if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu))) cpumask_clear_cpu(cpu, &cpumask); if (cpumask_empty(&cpumask)) break; @@ -489,6 +465,7 @@ NOKPROBE_SYMBOL(smp_emergency_stop); */ void smp_send_stop(void) { + struct pcpu *pcpu; int cpu; /* Disable all interrupts/machine checks */ @@ -504,8 +481,9 @@ void smp_send_stop(void) for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; - pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0); - while (!pcpu_stopped(pcpu_devices + cpu)) + pcpu = per_cpu_ptr(&pcpu_devices, cpu); + pcpu_sigp_retry(pcpu, SIGP_STOP, 0); + while (!pcpu_stopped(pcpu)) cpu_relax(); } } @@ -519,7 +497,7 @@ static void smp_handle_ext_call(void) unsigned long bits; /* handle bit signal external calls */ - bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0); + bits = this_cpu_xchg(pcpu_devices.ec_mask, 0); if (test_bit(ec_stop_cpu, &bits)) smp_stop_cpu(); if (test_bit(ec_schedule, &bits)) @@ -544,12 +522,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) int cpu; for_each_cpu(cpu, mask) - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } void arch_send_call_function_single_ipi(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single); } /* @@ -559,13 +537,13 @@ void arch_send_call_function_single_ipi(int cpu) */ void arch_smp_send_reschedule(int cpu) { - pcpu_ec_call(pcpu_devices + cpu, ec_schedule); + pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_schedule); } #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); + pcpu_ec_call(this_cpu_ptr(&pcpu_devices), 
ec_irq_work); } #endif @@ -577,7 +555,7 @@ int smp_store_status(int cpu) struct pcpu *pcpu; unsigned long pa; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); lc = lowcore_ptr[cpu]; pa = __pa(&lc->floating_pt_save_area); if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS, @@ -596,7 +574,7 @@ int smp_store_status(int cpu) /* * Collect CPU state of the previous, crashed system. - * There are four cases: + * There are three cases: * 1) standard zfcp/nvme dump * condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true * The state for all CPUs except the boot CPU needs to be collected @@ -609,16 +587,16 @@ int smp_store_status(int cpu) * with sigp stop-and-store-status. The firmware or the boot-loader * stored the registers of the boot CPU in the absolute lowcore in the * memory of the old system. - * 3) kdump and the old kernel did not store the CPU state, - * or stand-alone kdump for DASD - * condition: OLDMEM_BASE != NULL && !is_kdump_kernel() + * 3) kdump or stand-alone kdump for DASD + * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == false * The state for all CPUs except the boot CPU needs to be collected * with sigp stop-and-store-status. The kexec code or the boot-loader * stored the registers of the boot CPU in the memory of the old system. - * 4) kdump and the old kernel stored the CPU state - * condition: OLDMEM_BASE != NULL && is_kdump_kernel() - * This case does not exist for s390 anymore, setup_arch explicitly - * deactivates the elfcorehdr= kernel parameter + * + * Note that the legacy kdump mode where the old kernel stored the CPU states + * no longer exists: setup_arch() explicitly deactivates the elfcorehdr= + * kernel parameter. The is_kdump_kernel() implementation on s390 is independent + * of the elfcorehdr= parameter. 
*/ static bool dump_available(void) { @@ -633,9 +611,7 @@ void __init smp_save_dump_ipl_cpu(void) if (!dump_available()) return; sa = save_area_alloc(true); - regs = memblock_alloc(512, 8); - if (!sa || !regs) - panic("could not allocate memory for boot CPU save area\n"); + regs = memblock_alloc_or_panic(512, 8); copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); memblock_free(regs, 512); @@ -668,8 +644,6 @@ void __init smp_save_dump_secondary_cpus(void) SIGP_CC_NOT_OPERATIONAL) continue; sa = save_area_alloc(false); - if (!sa) - panic("could not allocate memory for save area\n"); __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); save_area_add_regs(sa, page); if (cpu_has_vx()) { @@ -685,17 +659,36 @@ void __init smp_save_dump_secondary_cpus(void) void smp_cpu_set_polarization(int cpu, int val) { - pcpu_devices[cpu].polarization = val; + per_cpu(pcpu_devices, cpu).polarization = val; } int smp_cpu_get_polarization(int cpu) { - return pcpu_devices[cpu].polarization; + return per_cpu(pcpu_devices, cpu).polarization; +} + +void smp_cpu_set_capacity(int cpu, unsigned long val) +{ + per_cpu(pcpu_devices, cpu).capacity = val; +} + +unsigned long smp_cpu_get_capacity(int cpu) +{ + return per_cpu(pcpu_devices, cpu).capacity; +} + +void smp_set_core_capacity(int cpu, unsigned long val) +{ + int i; + + cpu = smp_get_base_cpu(cpu); + for (i = cpu; (i <= cpu + smp_cpu_mtid) && (i < nr_cpu_ids); i++) + smp_cpu_set_capacity(i, val); } int smp_cpu_get_cpu_address(int cpu) { - return pcpu_devices[cpu].address; + return per_cpu(pcpu_devices, cpu).address; } static void __ref smp_get_core_info(struct sclp_core_info *info, int early) @@ -719,8 +712,6 @@ static void __ref smp_get_core_info(struct sclp_core_info *info, int early) } } -static int smp_add_present_cpu(int cpu); - static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, bool configured, bool early) { @@ -736,15 +727,16 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) { if (pcpu_find_address(cpu_present_mask, address + i)) continue; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); pcpu->address = address + i; if (configured) pcpu->state = CPU_STATE_CONFIGURED; else pcpu->state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); set_cpu_present(cpu, true); - if (!early && smp_add_present_cpu(cpu) != 0) + if (!early && arch_register_cpu(cpu)) set_cpu_present(cpu, false); else nr++; @@ -771,7 +763,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) * that all SMT threads get subsequent logical CPU numbers. 
*/ if (early) { - core_id = pcpu_devices[0].address >> smp_cpu_mt_shift; + core_id = per_cpu(pcpu_devices, 0).address >> smp_cpu_mt_shift; for (i = 0; i < info->configured; i++) { core = &info->core[i]; if (core->core_id == core_id) { @@ -796,10 +788,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { @@ -831,9 +820,6 @@ void __init smp_detect_cpus(void) s_cpus += smp_cpu_mtid + 1; } pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); - - /* Add CPUs present at boot */ - __smp_rescan_cpus(info, true); memblock_free(info, sizeof(*info)); } @@ -842,15 +828,16 @@ void __init smp_detect_cpus(void) */ static void smp_start_secondary(void *cpuvoid) { + struct lowcore *lc = get_lowcore(); int cpu = raw_smp_processor_id(); - S390_lowcore.last_update_clock = get_tod_clock(); - S390_lowcore.restart_stack = (unsigned long)restart_stack; - S390_lowcore.restart_fn = (unsigned long)do_restart; - S390_lowcore.restart_data = 0; - S390_lowcore.restart_source = -1U; - S390_lowcore.restart_flags = 0; - restore_access_regs(S390_lowcore.access_regs_save_area); + lc->last_update_clock = get_tod_clock(); + lc->restart_stack = (unsigned long)restart_stack; + lc->restart_fn = (unsigned long)do_restart; + lc->restart_data = 0; + lc->restart_source = -1U; + lc->restart_flags = 0; + restore_access_regs(lc->access_regs_save_area); cpu_init(); rcutree_report_cpu_starting(cpu); init_cpu_timer(); @@ -873,7 +860,7 @@ static void smp_start_secondary(void *cpuvoid) /* Upping and downing of CPUs */ int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - struct pcpu *pcpu = pcpu_devices + cpu; + struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu); int rc; if (pcpu->state != CPU_STATE_CONFIGURED) @@ -891,8 +878,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) */ system_ctlreg_lock(); pcpu_prepare_secondary(pcpu, cpu); - pcpu_attach_task(pcpu, tidle); - pcpu_start_fn(pcpu, smp_start_secondary, NULL); + pcpu_attach_task(cpu, tidle); + pcpu_start_fn(cpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); @@ -937,18 +924,19 @@ void __cpu_die(unsigned int cpu) struct pcpu *pcpu; /* Wait until target cpu is down */ - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); while (!pcpu_stopped(pcpu)) cpu_relax(); - pcpu_free_lowcore(pcpu); + pcpu_free_lowcore(pcpu, cpu); cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask); + pcpu->flags = 0; } void __noreturn cpu_die(void) { idle_task_exit(); - pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); + pcpu_sigp_retry(this_cpu_ptr(&pcpu_devices), SIGP_STOP, 0); for (;;) ; } @@ -973,24 +961,30 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); system_ctl_set_bit(0, 13); + smp_rescan_cpus(true); } void __init smp_prepare_boot_cpu(void) { - struct pcpu *pcpu = pcpu_devices; + struct lowcore *lc = get_lowcore(); WARN_ON(!cpu_present(0) || !cpu_online(0)); - pcpu->state = CPU_STATE_CONFIGURED; - S390_lowcore.percpu_offset = __per_cpu_offset[0]; + lc->percpu_offset = __per_cpu_offset[0]; 
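/*
 * The mechanical change running through smp.c above and below is the
 * conversion of pcpu_devices from a static NR_CPUS array indexed with
 * pointer arithmetic (pcpu_devices + cpu) to a per-CPU variable. A
 * minimal sketch of the three accessor idioms the conversion swaps in;
 * struct pcpu_demo and demo_accessors() are hypothetical stand-ins,
 * while the <linux/percpu.h> accessors are the real API. Note that
 * this_cpu_ptr() assumes preemption is disabled at the call site.
 */
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/types.h>

struct pcpu_demo {
	unsigned long ec_mask;
	u16 address;
};

static DEFINE_PER_CPU(struct pcpu_demo, demo_devices);

static u16 demo_accessors(int cpu)
{
	/* was: pcpu_devices + cpu -- pointer to a (possibly remote) CPU's instance */
	struct pcpu_demo *remote = per_cpu_ptr(&demo_devices, cpu);
	/* was: pcpu_devices + smp_processor_id() -- this CPU's own instance */
	struct pcpu_demo *local = this_cpu_ptr(&demo_devices);
	/* by-value access, as in the sysfs show functions below */
	u16 addr = per_cpu(demo_devices, cpu).address;

	return addr + remote->address + local->address;
}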
+ ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0); + ipl_pcpu->state = CPU_STATE_CONFIGURED; + lc->pcpu = (unsigned long)ipl_pcpu; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(0, CPU_CAPACITY_HIGH); } void __init smp_setup_processor_id(void) { - pcpu_devices[0].address = stap(); - S390_lowcore.cpu_nr = 0; - S390_lowcore.spinlock_lockval = arch_spin_lockval(0); - S390_lowcore.spinlock_index = 0; + struct lowcore *lc = get_lowcore(); + + lc->cpu_nr = 0; + per_cpu(pcpu_devices, 0).address = stap(); + lc->spinlock_lockval = arch_spin_lockval(0); + lc->spinlock_index = 0; } /* @@ -1010,7 +1004,7 @@ static ssize_t cpu_configure_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state); + count = sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -1036,7 +1030,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_online(cpu + i)) goto out; - pcpu = pcpu_devices + cpu; + pcpu = per_cpu_ptr(&pcpu_devices, cpu); rc = 0; switch (val) { case 0: @@ -1048,7 +1042,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_STANDBY; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1063,7 +1057,7 @@ static ssize_t cpu_configure_store(struct device *dev, for (i = 0; i <= smp_cpu_mtid; i++) { if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i)) continue; - pcpu[i].state = CPU_STATE_CONFIGURED; + per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_CONFIGURED; smp_cpu_set_polarization(cpu + i, POLARIZATION_UNKNOWN); } @@ -1082,7 +1076,7 @@ static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store); static ssize_t show_cpu_address(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pcpu_devices[dev->id].address); + return sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address); } static DEVICE_ATTR(address, 0444, show_cpu_address, NULL); @@ -1108,35 +1102,34 @@ static struct attribute_group cpu_online_attr_group = { static int smp_cpu_online(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - return sysfs_create_group(&s->kobj, &cpu_online_attr_group); + return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group); } static int smp_cpu_pre_down(unsigned int cpu) { - struct device *s = &per_cpu(cpu_device, cpu)->dev; + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group); return 0; } -static int smp_add_present_cpu(int cpu) +bool arch_cpu_is_hotpluggable(int cpu) { - struct device *s; - struct cpu *c; + return !!cpu; +} + +int arch_register_cpu(int cpu) +{ + struct cpu *c = per_cpu_ptr(&cpu_devices, cpu); int rc; - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - per_cpu(cpu_device, cpu) = c; - s = &c->dev; - c->hotpluggable = !!cpu; + c->hotpluggable = arch_cpu_is_hotpluggable(cpu); rc = register_cpu(c, cpu); if (rc) goto out; - rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); + rc = sysfs_create_group(&c->dev.kobj, &cpu_common_attr_group); if (rc) goto out_cpu; rc = topology_cpu_init(c); @@ -1145,14 +1138,14 @@ static int 
smp_add_present_cpu(int cpu) return 0; out_topology: - sysfs_remove_group(&s->kobj, &cpu_common_attr_group); + sysfs_remove_group(&c->dev.kobj, &cpu_common_attr_group); out_cpu: unregister_cpu(c); out: return rc; } -int __ref smp_rescan_cpus(void) +int __ref smp_rescan_cpus(bool early) { struct sclp_core_info *info; int nr; @@ -1161,7 +1154,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - nr = __smp_rescan_cpus(info, false); + nr = __smp_rescan_cpus(info, early); kfree(info); if (nr) topology_schedule_update(); @@ -1178,7 +1171,7 @@ static ssize_t __ref rescan_store(struct device *dev, rc = lock_device_hotplug_sysfs(); if (rc) return rc; - rc = smp_rescan_cpus(); + rc = smp_rescan_cpus(false); unlock_device_hotplug(); return rc ? rc : count; } @@ -1187,7 +1180,7 @@ static DEVICE_ATTR_WO(rescan); static int __init s390_smp_init(void) { struct device *dev_root; - int cpu, rc = 0; + int rc; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { @@ -1196,17 +1189,9 @@ static int __init s390_smp_init(void) if (rc) return rc; } - - for_each_present_cpu(cpu) { - rc = smp_add_present_cpu(cpu); - if (rc) - goto out; - } - rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); rc = rc <= 0 ? rc : 0; -out: return rc; } subsys_initcall(s390_smp_init); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 94f440e38303..40edfde25f5b 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -5,6 +5,7 @@ * Copyright IBM Corp. 2006 */ +#include <linux/perf_event.h> #include <linux/stacktrace.h> #include <linux/uaccess.h> #include <linux/compat.h> @@ -62,42 +63,102 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return 0; } -void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, - const struct pt_regs *regs) +static inline bool store_ip(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, bool perf, + unsigned long ip) +{ +#ifdef CONFIG_PERF_EVENTS + if (perf) { + if (perf_callchain_store(entry, ip)) + return false; + return true; + } +#endif + return consume_entry(cookie, ip); +} + +static inline bool ip_invalid(unsigned long ip) { + /* + * Perform some basic checks if an instruction address taken + * from unreliable source is invalid. + */ + if (ip & 1) + return true; + if (ip < mmap_min_addr) + return true; + if (ip >= current->mm->context.asce_limit) + return true; + return false; +} + +static inline bool ip_within_vdso(unsigned long ip) +{ + return in_range(ip, current->mm->context.vdso_base, vdso_text_size()); +} + +void arch_stack_walk_user_common(stack_trace_consume_fn consume_entry, void *cookie, + struct perf_callchain_entry_ctx *entry, + const struct pt_regs *regs, bool perf) +{ + struct stack_frame_vdso_wrapper __user *sf_vdso; struct stack_frame_user __user *sf; unsigned long ip, sp; bool first = true; if (is_compat_task()) return; - if (!consume_entry(cookie, instruction_pointer(regs))) + if (!current->mm) + return; + ip = instruction_pointer(regs); + if (!store_ip(consume_entry, cookie, entry, perf, ip)) return; sf = (void __user *)user_stack_pointer(regs); pagefault_disable(); while (1) { if (__get_user(sp, &sf->back_chain)) break; - if (__get_user(ip, &sf->gprs[8])) + /* + * VDSO entry code has a non-standard stack frame layout. + * See VDSO user wrapper code for details. 
+ */ + if (!sp && ip_within_vdso(ip)) { + sf_vdso = (void __user *)sf; + if (__get_user(ip, &sf_vdso->return_address)) + break; + sp = (unsigned long)sf + STACK_FRAME_VDSO_OVERHEAD; + sf = (void __user *)sp; + if (__get_user(sp, &sf->back_chain)) + break; + } else { + sf = (void __user *)sp; + if (__get_user(ip, &sf->gprs[8])) + break; + } + /* Sanity check: ABI requires SP to be 8 byte aligned. */ + if (sp & 0x7) break; - if (ip & 0x1) { + if (ip_invalid(ip)) { /* * If the instruction address is invalid, and this * is the first stack frame, assume r14 has not * been written to the stack yet. Otherwise exit. */ - if (first && !(regs->gprs[14] & 0x1)) - ip = regs->gprs[14]; - else + if (!first) + break; + ip = regs->gprs[14]; + if (ip_invalid(ip)) break; } - if (!consume_entry(cookie, ip)) + if (!store_ip(consume_entry, cookie, entry, perf, ip)) break; - /* Sanity check: ABI requires SP to be aligned 8 bytes. */ - if (!sp || sp & 0x7) - break; - sf = (void __user *)sp; first = false; } pagefault_enable(); } + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + arch_stack_walk_user_common(consume_entry, cookie, NULL, regs, false); +} diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 30bb20461db4..d40f0b983e74 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -17,6 +17,7 @@ #include <asm/ebcdic.h> #include <asm/facility.h> #include <asm/sthyi.h> +#include <asm/asm.h> #include "entry.h" #define DED_WEIGHT 0xffff @@ -300,33 +301,56 @@ static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, return (struct diag204_x_part_block *)&block->cpus[i]; } -static void fill_diag(struct sthyi_sctns *sctns) +static void *diag204_get_data(bool diag204_allow_busy) { - int i, r, pages; - bool this_lpar; + unsigned long subcode; void *diag204_buf; - void *diag224_buf = NULL; - struct diag204_x_info_blk_hdr *ti_hdr; - struct diag204_x_part_block *part_block; - struct diag204_x_phys_block *phys_block; - struct lpar_cpu_inf lpar_inf = {}; - - /* Errors are handled through the validity bits in the response. */ - pages = diag204((unsigned long)DIAG204_SUBC_RSI | - (unsigned long)DIAG204_INFO_EXT, 0, NULL); - if (pages <= 0) - return; - + int pages, rc; + + subcode = DIAG204_SUBC_RSI; + subcode |= DIAG204_INFO_EXT; + pages = diag204(subcode, 0, NULL); + if (pages < 0) + return ERR_PTR(pages); + if (pages == 0) + return ERR_PTR(-ENODATA); diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); if (!diag204_buf) - return; + return ERR_PTR(-ENOMEM); + subcode = DIAG204_SUBC_STIB7; + subcode |= DIAG204_INFO_EXT; + if (diag204_has_bif() && diag204_allow_busy) + subcode |= DIAG204_BIF_BIT; + rc = diag204(subcode, pages, diag204_buf); + if (rc < 0) { + vfree(diag204_buf); + return ERR_PTR(rc); + } + return diag204_buf; +} - r = diag204((unsigned long)DIAG204_SUBC_STIB7 | - (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); - if (r < 0) - goto out; +static bool is_diag204_cached(struct sthyi_sctns *sctns) +{ + /* + * Check if validity bits are set when diag204 data + * is gathered. 
+ */ + if (sctns->par.infpval1) + return true; + return false; +} + +static void fill_diag(struct sthyi_sctns *sctns, void *diag204_buf) +{ + int i; + bool this_lpar; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!diag224_buf || diag224(diag224_buf)) @@ -392,7 +416,6 @@ static void fill_diag(struct sthyi_sctns *sctns) out: free_page((unsigned long)diag224_buf); - vfree(diag204_buf); } static int sthyi(u64 vaddr, u64 *rc) @@ -403,30 +426,41 @@ static int sthyi(u64 vaddr, u64 *rc) asm volatile( ".insn rre,0xB2560000,%[r1],%[r2]\n" - "ipm %[cc]\n" - "srl %[cc],28\n" - : [cc] "=&d" (cc), [r2] "+&d" (r2.pair) + CC_IPM(cc) + : CC_OUT(cc, cc), [r2] "+&d" (r2.pair) : [r1] "d" (r1.pair) - : "memory", "cc"); + : CC_CLOBBER_LIST("memory")); *rc = r2.odd; - return cc; + return CC_TRANSFORM(cc); } static int fill_dst(void *dst, u64 *rc) { + void *diag204_buf; + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; /* * If the facility is on, we don't want to emulate the instruction. * We ask the hypervisor to provide the data. */ - if (test_facility(74)) + if (test_facility(74)) { + memset(dst, 0, PAGE_SIZE); return sthyi((u64)dst, rc); - + } + /* + * When emulating, if diag204 returns BUSY don't reset dst buffer + * and use cached data. + */ + *rc = 0; + diag204_buf = diag204_get_data(is_diag204_cached(sctns)); + if (IS_ERR(diag204_buf)) + return PTR_ERR(diag204_buf); + memset(dst, 0, PAGE_SIZE); fill_hdr(sctns); fill_stsi(sctns); - fill_diag(sctns); - *rc = 0; + fill_diag(sctns, diag204_buf); + vfree(diag204_buf); return 0; } @@ -445,11 +479,14 @@ static int sthyi_update_cache(u64 *rc) { int r; - memset(sthyi_cache.info, 0, PAGE_SIZE); r = fill_dst(sthyi_cache.info, rc); - if (r) - return r; - sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + if (r == 0) { + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + } else if (r == -EBUSY) { + /* mark as expired and return 0 to keep using cached data */ + sthyi_cache.end = jiffies - 1; + r = 0; + } return r; } diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index dc2355c623d6..5ec28028315b 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -38,33 +38,6 @@ #include "entry.h" -/* - * Perform the mmap() system call. Linux for S/390 isn't able to handle more - * than 5 system call parameters, so this system call uses a memory block - * for parameter passing. - */ - -struct s390_mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) -{ - struct s390_mmap_arg_struct a; - int error = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -out: - return error; -} - #ifdef CONFIG_SYSVIPC /* * sys_ipc() is the de-multiplexer for the SysV IPC calls. 
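A detail worth calling out in the sthyi.c rework above: diag204_get_data() now folds every failure mode into the returned pointer using the kernel's ERR_PTR encoding, instead of pairing a buffer out-parameter with a status code, which is what lets fill_dst() simply return PTR_ERR(diag204_buf) on failure. A minimal sketch of the idiom with hypothetical names (ERR_PTR/IS_ERR/PTR_ERR are the real <linux/err.h> API):

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_get_data(int pages)
{
	void *buf;

	if (pages < 0)
		return ERR_PTR(pages);		/* propagate a negative errno */
	if (pages == 0)
		return ERR_PTR(-ENODATA);	/* "nothing available" becomes an errno too */
	buf = vmalloc(pages * PAGE_SIZE);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	return buf;				/* plain pointer on success */
}

static int demo_caller(void)
{
	void *buf = demo_get_data(4);

	if (IS_ERR(buf))
		return PTR_ERR(buf);		/* decode the errno back out */
	/* ... consume buf ... */
	vfree(buf);
	return 0;
}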
@@ -151,8 +124,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { add_random_kstack_offset(); enter_from_user_mode(regs); - regs->psw = S390_lowcore.svc_old_psw; - regs->int_code = S390_lowcore.svc_int_code; + regs->psw = get_lowcore()->svc_old_psw; + regs->int_code = get_lowcore()->svc_int_code; update_timer_sys(); if (static_branch_likely(&cpu_has_bear)) current->thread.last_break = regs->last_break; diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile index fb85e797946d..c5d958a09ff4 100644 --- a/arch/s390/kernel/syscalls/Makefile +++ b/arch/s390/kernel/syscalls/Makefile @@ -4,15 +4,15 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm uapi := $(gen)/uapi/asm -syscall := $(srctree)/$(src)/syscall.tbl -systbl := $(srctree)/$(src)/syscalltbl +syscall := $(src)/syscall.tbl +systbl := $(src)/syscalltbl gen-y := $(kapi)/syscall_table.h kapi-hdrs-y := $(kapi)/unistd_nr.h uapi-hdrs-y := $(uapi)/unistd_32.h uapi-hdrs-y += $(uapi)/unistd_64.h -targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) +targets += $(addprefix ../../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y)) PHONY += kapi uapi @@ -23,23 +23,26 @@ uapi: $(uapi-hdrs-y) # Create output directory if not already present $(shell mkdir -p $(uapi) $(kapi)) -filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $< +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$@" < $< > $@ -filechk_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< +quiet_cmd_sysnr = SYSNR $@ + cmd_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< > $@ -filechk_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< +quiet_cmd_syscalls = SYSTBL $@ + cmd_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< > $@ syshdr_abi_unistd_32 := common,32 -$(uapi)/unistd_32.h: $(syscall) FORCE - $(call filechk,syshdr,$@) +$(uapi)/unistd_32.h: $(syscall) $(systbl) FORCE + $(call if_changed,syshdr) syshdr_abi_unistd_64 := common,64 -$(uapi)/unistd_64.h: $(syscall) FORCE - $(call filechk,syshdr,$@) +$(uapi)/unistd_64.h: $(syscall) $(systbl) FORCE + $(call if_changed,syshdr) -$(kapi)/syscall_table.h: $(syscall) FORCE - $(call filechk,syscalls) +$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE + $(call if_changed,syscalls) sysnr_abi_unistd_nr := common,32,64 -$(kapi)/unistd_nr.h: $(syscall) FORCE - $(call filechk,sysnr) +$(kapi)/unistd_nr.h: $(syscall) $(systbl) FORCE + $(call if_changed,sysnr) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 095bb86339a7..e9115b4d8b63 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -418,7 +418,7 @@ 412 32 utimensat_time64 - sys_utimensat 413 32 pselect6_time64 - compat_sys_pselect6_time64 414 32 ppoll_time64 - compat_sys_ppoll_time64 -416 32 io_pgetevents_time64 - sys_io_pgetevents +416 32 io_pgetevents_time64 - compat_sys_io_pgetevents_time64 417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 418 32 mq_timedsend_time64 - sys_mq_timedsend 419 32 mq_timedreceive_time64 - sys_mq_timedreceive @@ -464,3 +464,8 @@ 459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules +462 common mseal sys_mseal sys_mseal +463 common setxattrat sys_setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat 
sys_getxattrat +465 common listxattrat sys_listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat sys_removexattrat diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 2be30a96696a..88055f58fbda 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -498,7 +498,6 @@ static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ .open = stsi_open_##fc##_##s1##_##s2, \ .release = stsi_release, \ .read = stsi_read, \ - .llseek = no_llseek, \ }; static int stsi_release(struct inode *inode, struct file *file) diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S index c0a70efa2426..26f2981aa09e 100644 --- a/arch/s390/kernel/text_amode31.S +++ b/arch/s390/kernel/text_amode31.S @@ -18,8 +18,7 @@ * affects a few functions that are not performance-relevant. */ .macro BR_EX_AMODE31_r14 - larl %r1,0f - ex 0,0(%r1) + exrl 0,0f j . 0: br %r14 .endm diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index fb9f31f36628..e9f47c3a6197 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -36,7 +36,6 @@ #include <linux/profile.h> #include <linux/timex.h> #include <linux/notifier.h> -#include <linux/timekeeper_internal.h> #include <linux/clockchips.h> #include <linux/gfp.h> #include <linux/kprobes.h> @@ -131,7 +130,7 @@ void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = clock_comparator_max; + get_lowcore()->clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -139,8 +138,8 @@ void clock_comparator_work(void) static int s390_next_event(unsigned long delta, struct clock_event_device *evt) { - S390_lowcore.clock_comparator = get_tod_clock() + delta; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = get_tod_clock() + delta; + set_clock_comparator(get_lowcore()->clock_comparator); return 0; } @@ -153,8 +152,8 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = clock_comparator_max; - set_clock_comparator(S390_lowcore.clock_comparator); + get_lowcore()->clock_comparator = clock_comparator_max; + set_clock_comparator(get_lowcore()->clock_comparator); cpu = smp_processor_id(); cd = &per_cpu(comparators, cpu); @@ -184,8 +183,8 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == clock_comparator_max) - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator == clock_comparator_max) + set_clock_comparator(get_lowcore()->clock_comparator); } static void stp_timing_alert(struct stp_irq_parm *); @@ -255,6 +254,7 @@ static struct clocksource clocksource_tod = { .shift = 24, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .vdso_clock_mode = VDSO_CLOCKMODE_TOD, + .id = CSID_S390_TOD, }; struct clocksource * __init clocksource_default_clock(void) @@ -408,12 +408,12 @@ static void clock_sync_global(long delta) static void clock_sync_local(long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != clock_comparator_max) { - S390_lowcore.clock_comparator += delta; - set_clock_comparator(S390_lowcore.clock_comparator); + if (get_lowcore()->clock_comparator != clock_comparator_max) { + get_lowcore()->clock_comparator += delta; + set_clock_comparator(get_lowcore()->clock_comparator); } /* Adjust the last_update_clock time-stamp. 
*/ - S390_lowcore.last_update_clock += delta; + get_lowcore()->last_update_clock += delta; } /* Single threaded workqueue used for stp sync events */ @@ -468,6 +468,12 @@ static void __init stp_reset(void) } } +bool stp_enabled(void) +{ + return test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online; +} +EXPORT_SYMBOL(stp_enabled); + static void stp_timeout(struct timer_list *unused) { queue_work(time_sync_wq, &stp_work); @@ -656,12 +662,12 @@ static void stp_check_leap(void) if (ret < 0) pr_err("failed to set leap second flags\n"); /* arm Timer to clear leap second flags */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(14400 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(14400)); } else { /* The day the leap second is scheduled for hasn't been reached. Retry * in one hour. */ - mod_timer(&stp_timer, jiffies + msecs_to_jiffies(3600 * MSEC_PER_SEC)); + mod_timer(&stp_timer, jiffies + secs_to_jiffies(3600)); } } @@ -729,8 +735,8 @@ static ssize_t ctn_id_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%016lx\n", - *(unsigned long *) stp_info.ctnid); + ret = sysfs_emit(buf, "%016lx\n", + *(unsigned long *)stp_info.ctnid); mutex_unlock(&stp_mutex); return ret; } @@ -745,7 +751,7 @@ static ssize_t ctn_type_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.ctn); + ret = sysfs_emit(buf, "%i\n", stp_info.ctn); mutex_unlock(&stp_mutex); return ret; } @@ -760,7 +766,7 @@ static ssize_t dst_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x2000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.dsto); mutex_unlock(&stp_mutex); return ret; } @@ -775,7 +781,7 @@ static ssize_t leap_seconds_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x8000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.leaps); mutex_unlock(&stp_mutex); return ret; } @@ -801,11 +807,11 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev, return ret; if (!stzi.lsoib.p) - return sprintf(buf, "0,0\n"); + return sysfs_emit(buf, "0,0\n"); - return sprintf(buf, "%lu,%d\n", - tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, - stzi.lsoib.nlso - stzi.lsoib.also); + return sysfs_emit(buf, "%lu,%d\n", + tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC, + stzi.lsoib.nlso - stzi.lsoib.also); } static DEVICE_ATTR_RO(leap_seconds_scheduled); @@ -818,7 +824,7 @@ static ssize_t stratum_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.stratum); mutex_unlock(&stp_mutex); return ret; } @@ -833,7 +839,7 @@ static ssize_t time_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x0800)) - ret = sprintf(buf, "%i\n", (int) stp_info.tto); + ret = sysfs_emit(buf, "%i\n", (int)stp_info.tto); mutex_unlock(&stp_mutex); return ret; } @@ -848,7 +854,7 @@ static ssize_t time_zone_offset_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid() && (stp_info.vbits & 0x4000)) - ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo); + ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.tzo); mutex_unlock(&stp_mutex); return ret; } @@ -863,7 +869,7 @@ static ssize_t timing_mode_show(struct device *dev, 
mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tmd); + ret = sysfs_emit(buf, "%i\n", stp_info.tmd); mutex_unlock(&stp_mutex); return ret; } @@ -878,7 +884,7 @@ static ssize_t timing_state_show(struct device *dev, mutex_lock(&stp_mutex); if (stpinfo_valid()) - ret = sprintf(buf, "%i\n", stp_info.tst); + ret = sysfs_emit(buf, "%i\n", stp_info.tst); mutex_unlock(&stp_mutex); return ret; } @@ -889,7 +895,7 @@ static ssize_t online_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%i\n", stp_online); + return sysfs_emit(buf, "%i\n", stp_online); } static ssize_t online_store(struct device *dev, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 89e91b8ce842..211cc8382e4a 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -24,7 +24,9 @@ #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/node.h> +#include <asm/hiperdispatch.h> #include <asm/sysinfo.h> +#include <asm/asm.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -47,6 +49,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; +static int cpu_management; static DECLARE_WORK(topology_work, topology_work_fn); @@ -144,6 +147,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(cpu, &book->mask); cpumask_set_cpu(cpu, &socket->mask); smp_cpu_set_polarization(cpu, tl_core->pp); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); } } } @@ -221,15 +225,15 @@ static void topology_update_polarization_simple(void) static int ptf(unsigned long fc) { - int rc; + int cc; asm volatile( - " .insn rre,0xb9a20000,%1,%1\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc) - : "d" (fc) : "cc"); - return rc; + " .insn rre,0xb9a20000,%[fc],%[fc]\n" + CC_IPM(cc) + : CC_OUT(cc, cc) + : [fc] "d" (fc) + : CC_CLOBBER); + return CC_TRANSFORM(cc); } int topology_set_cpu_management(int fc) @@ -270,6 +274,7 @@ void update_cpu_masks(void) topo->drawer_id = id; } } + hd_reset_state(); for_each_online_cpu(cpu) { topo = &cpu_topology[cpu]; pkg_first = cpumask_first(&topo->core_mask); @@ -278,8 +283,10 @@ void update_cpu_masks(void) for_each_cpu(sibling, &topo->core_mask) { topo_sibling = &cpu_topology[sibling]; smt_first = cpumask_first(&topo_sibling->thread_mask); - if (sibling == smt_first) + if (sibling == smt_first) { topo_package->booted_cores++; + hd_add_core(sibling); + } } } else { topo->booted_cores = topo_package->booted_cores; @@ -303,8 +310,10 @@ static void __arch_update_dedicated_flag(void *arg) static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - int rc = 0; + int rc, hd_status; + hd_status = 0; + rc = 0; mutex_lock(&smp_cpu_state_mutex); if (MACHINE_HAS_TOPOLOGY) { rc = 1; @@ -314,22 +323,20 @@ static int __arch_update_cpu_topology(void) update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + if (cpu_management == 1) + hd_status = hd_enable_hiperdispatch(); mutex_unlock(&smp_cpu_state_mutex); + if (hd_status == 0) + hd_disable_hiperdispatch(); return rc; } int arch_update_cpu_topology(void) { - struct device *dev; - int cpu, rc; + int rc; rc = __arch_update_cpu_topology(); on_each_cpu(__arch_update_dedicated_flag, NULL, 0); - for_each_online_cpu(cpu) { - dev = get_cpu_device(cpu); - if (dev) - kobject_uevent(&dev->kobj, KOBJ_CHANGE); - } return rc; } @@ -364,7 +371,7 @@ static void 
set_topology_timer(void) if (atomic_add_unless(&topology_poll, -1, 0)) mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100)); else - mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC)); + mod_timer(&topology_timer, jiffies + secs_to_jiffies(60)); } void topology_expect_change(void) @@ -380,7 +387,24 @@ void topology_expect_change(void) set_topology_timer(); } -static int cpu_management; +static int set_polarization(int polarization) +{ + int rc = 0; + + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == polarization) + goto out; + rc = topology_set_cpu_management(polarization); + if (rc) + goto out; + cpu_management = polarization; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc; +} static ssize_t dispatching_show(struct device *dev, struct device_attribute *attr, @@ -389,7 +413,7 @@ static ssize_t dispatching_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", cpu_management); + count = sysfs_emit(buf, "%d\n", cpu_management); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -406,19 +430,7 @@ static ssize_t dispatching_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - rc = 0; - cpus_read_lock(); - mutex_lock(&smp_cpu_state_mutex); - if (cpu_management == val) - goto out; - rc = topology_set_cpu_management(val); - if (rc) - goto out; - cpu_management = val; - topology_expect_change(); -out: - mutex_unlock(&smp_cpu_state_mutex); - cpus_read_unlock(); + rc = set_polarization(val); return rc ? rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -432,19 +444,19 @@ static ssize_t cpu_polarization_show(struct device *dev, mutex_lock(&smp_cpu_state_mutex); switch (smp_cpu_get_polarization(cpu)) { case POLARIZATION_HRZ: - count = sprintf(buf, "horizontal\n"); + count = sysfs_emit(buf, "horizontal\n"); break; case POLARIZATION_VL: - count = sprintf(buf, "vertical:low\n"); + count = sysfs_emit(buf, "vertical:low\n"); break; case POLARIZATION_VM: - count = sprintf(buf, "vertical:medium\n"); + count = sysfs_emit(buf, "vertical:medium\n"); break; case POLARIZATION_VH: - count = sprintf(buf, "vertical:high\n"); + count = sysfs_emit(buf, "vertical:high\n"); break; default: - count = sprintf(buf, "unknown\n"); + count = sysfs_emit(buf, "unknown\n"); break; } mutex_unlock(&smp_cpu_state_mutex); @@ -468,7 +480,7 @@ static ssize_t cpu_dedicated_show(struct device *dev, ssize_t count; mutex_lock(&smp_cpu_state_mutex); - count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu)); + count = sysfs_emit(buf, "%d\n", topology_cpu_dedicated(cpu)); mutex_unlock(&smp_cpu_state_mutex); return count; } @@ -536,14 +548,21 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } +static int __init detect_polarization(union topology_entry *tle) +{ + struct topology_core *tl_core; + + while (tle->nl) + tle = next_tle(tle); + tl_core = (struct topology_core *)tle; + return tl_core->pp != POLARIZATION_HRZ; +} + void __init topology_init_early(void) { struct sysinfo_15_1_x *info; @@ -557,12 +576,10 @@ void __init topology_init_early(void) } if 
(!MACHINE_HAS_TOPOLOGY) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); + cpu_management = detect_polarization(info->tle); pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", info->mag[0], info->mag[1], info->mag[2], info->mag[3], info->mag[4], info->mag[5], info->mnest); @@ -600,7 +617,7 @@ static int __init topology_setup(char *str) } early_param("topology", topology_setup); -static int topology_ctl_handler(struct ctl_table *ctl, int write, +static int topology_ctl_handler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); @@ -630,12 +647,37 @@ static int topology_ctl_handler(struct ctl_table *ctl, int write, return rc; } -static struct ctl_table topology_ctl_table[] = { +static int polarization_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int polarization; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &polarization, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + polarization = cpu_management; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + return set_polarization(polarization); +} + +static const struct ctl_table topology_ctl_table[] = { { .procname = "topology", .mode = 0644, .proc_handler = topology_ctl_handler, }, + { + .procname = "polarization", + .mode = 0644, + .proc_handler = polarization_ctl_handler, + }, }; static int __init topology_init(void) @@ -648,6 +690,8 @@ static int __init topology_init(void) set_topology_timer(); else topology_update_polarization_simple(); + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY_VERTICAL)) + set_polarization(1); register_sysctl("s390", topology_ctl_table); dev_root = bus_get_dev_root(&cpu_subsys); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 52578b5cecbd..24fee11b030d 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -27,9 +27,11 @@ #include <linux/uaccess.h> #include <linux/cpu.h> #include <linux/entry-common.h> +#include <linux/kmsan.h> #include <asm/asm-extable.h> #include <asm/vtime.h> #include <asm/fpu.h> +#include <asm/fault.h> #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -262,6 +264,11 @@ static void monitor_event_exception(struct pt_regs *regs) void kernel_stack_overflow(struct pt_regs *regs) { + /* + * Normally regs are unpoisoned by the generic entry code, but + * kernel_stack_overflow() is a rare case that is called bypassing it. 
+ */ + kmsan_unpoison_entry_regs(regs); bust_spinlocks(1); printk("Kernel stack overflow.\n"); show_regs(regs); @@ -288,15 +295,16 @@ static void __init test_monitor_call(void) void __init trap_init(void) { + struct lowcore *lc = get_lowcore(); unsigned long flags; struct ctlreg cr0; local_irq_save(flags); cr0 = local_ctl_clear_bit(0, CR0_LOW_ADDRESS_PROTECTION_BIT); - psw_bits(S390_lowcore.external_new_psw).mcheck = 1; - psw_bits(S390_lowcore.program_new_psw).mcheck = 1; - psw_bits(S390_lowcore.svc_new_psw).mcheck = 1; - psw_bits(S390_lowcore.io_new_psw).mcheck = 1; + psw_bits(lc->external_new_psw).mcheck = 1; + psw_bits(lc->program_new_psw).mcheck = 1; + psw_bits(lc->svc_new_psw).mcheck = 1; + psw_bits(lc->io_new_psw).mcheck = 1; local_ctl_load(0, &cr0); local_irq_restore(flags); local_mcck_enable(); @@ -307,11 +315,27 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { - unsigned int trapnr; + struct lowcore *lc = get_lowcore(); irqentry_state_t state; + unsigned int trapnr; + union teid teid; + + teid.val = lc->trans_exc_code; + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = teid.val; - regs->int_code = S390_lowcore.pgm_int_code; - regs->int_parm_long = S390_lowcore.trans_exc_code; + /* + * In case of a guest fault, short-circuit the fault handler and return. + * This way the sie64a() function will return 0; fault address and + * other relevant bits are saved in current->thread.gmap_teid, and + * the fault number in current->thread.gmap_int_code. KVM will be + * able to use this information to handle the fault. + */ + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) { + current->thread.gmap_teid.val = regs->int_parm_long; + current->thread.gmap_int_code = regs->int_code & 0xffff; + return; + } state = irqentry_enter(regs); @@ -324,19 +348,19 @@ void noinstr __do_pgm_check(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - if (S390_lowcore.pgm_code & 0x0200) { + if (lc->pgm_code & 0x0200) { /* transaction abort */ - current->thread.trap_tdb = S390_lowcore.pgm_tdb; + current->thread.trap_tdb = lc->pgm_tdb; } - if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (lc->pgm_code & PGM_INT_CODE_PER) { if (user_mode(regs)) { struct per_event *ev = ¤t->thread.per_event; set_thread_flag(TIF_PER_TRAP); - ev->address = S390_lowcore.per_address; - ev->cause = S390_lowcore.per_code_combined; - ev->paid = S390_lowcore.per_access_id; + ev->address = lc->per_address; + ev->cause = lc->per_code_combined; + ev->paid = lc->per_access_id; } else { /* PER event in kernel is kprobes */ __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); @@ -400,8 +424,8 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { [0x3b] = do_dat_exception, [0x3c] = default_trap_handler, [0x3d] = do_secure_storage_access, - [0x3e] = do_non_secure_storage_access, - [0x3f] = do_secure_storage_violation, + [0x3e] = default_trap_handler, + [0x3f] = default_trap_handler, [0x40] = monitor_event_exception, [0x41 ... 
0x7f] = default_trap_handler, }; @@ -412,5 +436,3 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = { __stringify(default_trap_handler)) COND_TRAP(do_secure_storage_access); -COND_TRAP(do_non_secure_storage_access); -COND_TRAP(do_secure_storage_violation); diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index 0ece156fdd7c..cd44be2b6ce8 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -49,6 +49,8 @@ static inline bool is_final_pt_regs(struct unwind_state *state, READ_ONCE_NOCHECK(regs->psw.mask) & PSW_MASK_PSTATE; } +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; @@ -118,6 +120,8 @@ out_stop: } EXPORT_SYMBOL_GPL(unwind_next_frame); +/* Avoid KMSAN false positives from touching uninitialized frames. */ +__no_kmsan_checks void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long first_frame) { diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index fc07bc39e698..9f05df2da2f7 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -2,7 +2,7 @@ /* * Common Ultravisor functions and initialization * - * Copyright IBM Corp. 2019, 2020 + * Copyright IBM Corp. 2019, 2024 */ #define KMSG_COMPONENT "prot_virt" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,14 +14,14 @@ #include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/pagewalk.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); -#endif +EXPORT_SYMBOL(prot_virt_guest); /* * uv_info contains both host and guest information but it's currently only @@ -34,7 +34,6 @@ int __bootdata_preserved(prot_virt_guest); struct uv_info __bootdata_preserved(uv_info); EXPORT_SYMBOL(uv_info); -#if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); @@ -109,7 +108,7 @@ EXPORT_SYMBOL_GPL(uv_pin_shared); * * @paddr: Absolute host address of page to be destroyed */ -static int uv_destroy_page(unsigned long paddr) +static int uv_destroy(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_DESTR_SEC_STOR, @@ -130,20 +129,33 @@ static int uv_destroy_page(unsigned long paddr) } /* - * The caller must already hold a reference to the page + * The caller must already hold a reference to the folio */ -int uv_destroy_owned_page(unsigned long paddr) +int uv_destroy_folio(struct folio *folio) { - struct page *page = phys_to_page(paddr); int rc; - get_page(page); - rc = uv_destroy_page(paddr); + /* See gmap_make_secure(): large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_destroy(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &page->flags); - put_page(page); + clear_bit(PG_arch_1, &folio->flags); + folio_put(folio); return rc; } +EXPORT_SYMBOL(uv_destroy_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. 
+ */ +int uv_destroy_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_destroy_folio(pfn_folio(pte_pfn(pte))); +} /* * Requests the Ultravisor to encrypt a guest page and make it @@ -163,54 +175,88 @@ int uv_convert_from_secure(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure); /* - * The caller must already hold a reference to the page + * The caller must already hold a reference to the folio. */ -int uv_convert_owned_from_secure(unsigned long paddr) +int uv_convert_from_secure_folio(struct folio *folio) { - struct page *page = phys_to_page(paddr); int rc; - get_page(page); - rc = uv_convert_from_secure(paddr); + /* See gmap_make_secure(): large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) + return 0; + + folio_get(folio); + rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &page->flags); - put_page(page); + clear_bit(PG_arch_1, &folio->flags); + folio_put(folio); return rc; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); + +/* + * The present PTE still indirectly holds a folio reference through the mapping. + */ +int uv_convert_from_secure_pte(pte_t pte) +{ + VM_WARN_ON(!pte_present(pte)); + return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); +} /* - * Calculate the expected ref_count for a page that would otherwise have no + * Calculate the expected ref_count for a folio that would otherwise have no * further pins. This was cribbed from similar functions in other places in * the kernel, but with some slight modifications. We know that a secure - * page can not be a huge page for example. + * folio can not be a large folio, for example. */ -static int expected_page_refs(struct page *page) +static int expected_folio_refs(struct folio *folio) { int res; - res = page_mapcount(page); - if (PageSwapCache(page)) { + res = folio_mapcount(folio); + if (folio_test_swapcache(folio)) { res++; - } else if (page_mapping(page)) { + } else if (folio_mapping(folio)) { res++; - if (page_has_private(page)) + if (folio->private) res++; } return res; } -static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) +/** + * make_folio_secure() - make a folio secure + * @folio: the folio to make secure + * @uvcb: the uvcb that describes the UVC to be used + * + * The folio @folio will be made secure if possible, @uvcb will be passed + * as-is to the UVC. + * + * Return: 0 on success; + * -EBUSY if the folio is in writeback or has too many references; + * -E2BIG if the folio is large; + * -EAGAIN if the UVC needs to be attempted again; + * -ENXIO if the address is not mapped; + * -EINVAL if the UVC failed for other reasons. + * + * Context: The caller must hold exactly one extra reference on the folio + * (it's the same logic as split_folio()) + */ +int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; - if (PageWriteback(page)) - return -EAGAIN; - expected = expected_page_refs(page); - if (!page_ref_freeze(page, expected)) + if (folio_test_large(folio)) + return -E2BIG; + if (folio_test_writeback(folio)) return -EBUSY; - set_bit(PG_arch_1, &page->flags); + expected = expected_folio_refs(folio) + 1; + if (!folio_ref_freeze(folio, expected)) + return -EBUSY; + set_bit(PG_arch_1, &folio->flags); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. 
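As a usage sketch of the make_folio_secure() contract documented above: the helper name secure_one_folio() below is invented for illustration (the real callers live in the KVM gmap code), and it assumes the caller retries on -EAGAIN and -EBUSY.

/*
 * Hypothetical caller sketch for make_folio_secure(); illustration only.
 * It takes the single extra folio reference the kernel-doc demands and
 * holds the folio lock across the UV call, as the old gmap path did.
 */
static int secure_one_folio(struct folio *folio, struct uv_cb_header *uvcb)
{
	int rc;

	folio_get(folio);		/* the one extra reference */
	if (!folio_trylock(folio)) {
		folio_put(folio);
		return -EAGAIN;		/* let the caller retry */
	}
	rc = make_folio_secure(folio, uvcb);
	folio_unlock(folio);
	folio_put(folio);
	return rc;	/* 0, -EBUSY, -E2BIG, -EAGAIN, -ENXIO or -EINVAL */
}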
@@ -220,9 +266,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) * -EAGAIN and we let the callers deal with it. */ cc = __uv_call(0, (u64)uvcb); - page_ref_unfreeze(page, expected); + folio_ref_unfreeze(folio, expected); /* - * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * Return -ENXIO if the folio was not mapped, -EINVAL for other errors. * If busy or partially completed, return -EAGAIN. */ if (cc == UVC_CC_OK) @@ -231,245 +277,49 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } - -/** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process - * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. - * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. - * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). - * - * Return: true if an export is needed before every import, otherwise false. - */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) -{ - /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. - */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; -} - -/* - * Requests the Ultravisor to make a page accessible to a guest. - * If it's brought in the first time, it will be cleared. If - * it has been exported before, it will be decrypted and integrity - * checked. - */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) -{ - struct vm_area_struct *vma; - bool local_drain = false; - spinlock_t *ptelock; - unsigned long uaddr; - struct page *page; - pte_t *ptep; - int rc; - -again: - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. If - * userspace is playing dirty tricky with mapping huge pages later - * on this will result in a segmentation fault. - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = -ENXIO; - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - if (!ptep) - goto out; - if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { - page = pte_page(*ptep); - rc = -EAGAIN; - if (trylock_page(page)) { - if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(page_to_phys(page)); - rc = make_page_secure(page, uvcb); - unlock_page(page); - } - } - pte_unmap_unlock(ptep, ptelock); -out: - mmap_read_unlock(gmap->mm); - - if (rc == -EAGAIN) { - /* - * If we are here because the UVC returned busy or partial - * completion, this is just a useless check, but it is safe. 
- */ - wait_on_page_writeback(page); - } else if (rc == -EBUSY) { - /* - * If we have tried a local drain and the page refcount - * still does not match our expected safe value, try with a - * system wide drain. This is needed if the pagevecs holding - * the page are on a different CPU. - */ - if (local_drain) { - lru_add_drain_all(); - /* We give up here, and let the caller try again */ - return -EAGAIN; - } - /* - * We are here if the page refcount does not match the - * expected safe value. The main culprits are usually - * pagevecs. With lru_add_drain() we drain the pagevecs - * on the local CPU so that hopefully the refcount will - * reach the expected safe value. - */ - lru_add_drain(); - local_drain = true; - /* And now we try again immediately after draining */ - goto again; - } else if (rc == -ENXIO) { - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) - return -EFAULT; - return -EAGAIN; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_make_secure); - -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) -{ - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; - - return gmap_make_secure(gmap, gaddr, &uvcb); -} -EXPORT_SYMBOL_GPL(gmap_convert_to_secure); - -/** - * gmap_destroy_page - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy - * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. - */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) -{ - struct vm_area_struct *vma; - unsigned long uaddr; - struct page *page; - int rc; - - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Huge pages should not be able to become secure - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = 0; - /* we take an extra reference here */ - page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) - goto out; - rc = uv_destroy_owned_page(page_to_phys(page)); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ - if (rc) - rc = uv_convert_owned_from_secure(page_to_phys(page)); - put_page(page); -out: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_destroy_page); +EXPORT_SYMBOL_GPL(make_folio_secure); /* - * To be called with the page locked or with an extra reference! This will - * prevent gmap_make_secure from touching the page concurrently. Having 2 - * parallel make_page_accessible is fine, as the UV calls will become a - * no-op if the page is already exported. + * To be called with the folio locked or with an extra reference! This will + * prevent gmap_make_secure from touching the folio concurrently. Having 2 + * parallel arch_make_folio_accessible is fine, as the UV calls will become a + * no-op if the folio is already exported. 
*/ -int arch_make_page_accessible(struct page *page) +int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* Hugepage cannot be protected, so nothing to do */ - if (PageHuge(page)) + /* See gmap_make_secure(): large folios cannot be secure */ + if (unlikely(folio_test_large(folio))) return 0; /* - * PG_arch_1 is used in 3 places: - * 1. for kernel page tables during early boot - * 2. for storage keys of huge pages and KVM - * 3. As an indication that this page might be secure. This can + * PG_arch_1 is used in 2 places: + * 1. for storage keys of hugetlb folios and KVM + * 2. As an indication that this small folio might be secure. This can * overindicate, e.g. we set the bit before calling * convert_to_secure. - * As secure pages are never huge, all 3 variants can co-exists. + * As secure pages are never large folios, both variants can co-exists. */ - if (!test_bit(PG_arch_1, &page->flags)) + if (!test_bit(PG_arch_1, &folio->flags)) return 0; - rc = uv_pin_shared(page_to_phys(page)); + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); return 0; } - rc = uv_convert_from_secure(page_to_phys(page)); + rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); return 0; } return rc; } -EXPORT_SYMBOL_GPL(arch_make_page_accessible); +EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -#endif - -#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) static ssize_t uv_query_facilities(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -612,12 +462,32 @@ static struct kobj_attribute uv_query_supp_secret_types_attr = static ssize_t uv_query_max_secrets(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d\n", uv_info.max_secrets); + return sysfs_emit(buf, "%d\n", + uv_info.max_assoc_secrets + uv_info.max_retr_secrets); } static struct kobj_attribute uv_query_max_secrets_attr = __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); +static ssize_t uv_query_max_retr_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_retr_secrets); +} + +static struct kobj_attribute uv_query_max_retr_secrets_attr = + __ATTR(max_retr_secrets, 0444, uv_query_max_retr_secrets, NULL); + +static ssize_t uv_query_max_assoc_secrets(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_assoc_secrets); +} + +static struct kobj_attribute uv_query_max_assoc_secrets_attr = + __ATTR(max_assoc_secrets, 0444, uv_query_max_assoc_secrets, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, @@ -635,34 +505,91 @@ static struct attribute *uv_query_attrs[] = { &uv_query_supp_add_secret_pcf_attr.attr, &uv_query_supp_secret_types_attr.attr, &uv_query_max_secrets_attr.attr, + &uv_query_max_assoc_secrets_attr.attr, + &uv_query_max_retr_secrets_attr.attr, NULL, }; +static inline struct uv_cb_query_keys uv_query_keys(void) +{ + struct uv_cb_query_keys uvcb = { + .header.cmd = UVC_CMD_QUERY_KEYS, + .header.len = sizeof(uvcb) + }; + + uv_call(0, (uint64_t)&uvcb); + return uvcb; +} + +static inline ssize_t emit_hash(struct uv_key_hash *hash, char *buf, int at) +{ + return sysfs_emit_at(buf, at, "%016llx%016llx%016llx%016llx\n", + hash->dword[0], hash->dword[1], hash->dword[2], hash->dword[3]); +} + 
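+/*
+ * Reader annotation, not part of the patch: uv_query_keys() and
+ * emit_hash() above back the three read-only attributes defined next,
+ * which surface the Ultravisor key hashes in a new "keys" sysfs
+ * directory next to "query" (/sys/firmware/uv/keys/host_key,
+ * backup_host_key and all in the upstream layout). Each hash prints
+ * as one 64-hex-digit (256-bit) line.
+ */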
+static ssize_t uv_keys_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_host_key_attr = + __ATTR(host_key, 0444, uv_keys_host_key, NULL); + +static ssize_t uv_keys_backup_host_key(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + + return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_BACK_HK], buf, 0); +} + +static struct kobj_attribute uv_keys_backup_host_key_attr = + __ATTR(backup_host_key, 0444, uv_keys_backup_host_key, NULL); + +static ssize_t uv_keys_all(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uv_cb_query_keys uvcb = uv_query_keys(); + ssize_t len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(uvcb.key_hashes); i++) + len += emit_hash(uvcb.key_hashes + i, buf, len); + + return len; +} + +static struct kobj_attribute uv_keys_all_attr = + __ATTR(all, 0444, uv_keys_all, NULL); + static struct attribute_group uv_query_attr_group = { .attrs = uv_query_attrs, }; +static struct attribute *uv_keys_attrs[] = { + &uv_keys_host_key_attr.attr, + &uv_keys_backup_host_key_attr.attr, + &uv_keys_all_attr.attr, + NULL, +}; + +static struct attribute_group uv_keys_attr_group = { + .attrs = uv_keys_attrs, +}; + static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int val = 0; - -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST - val = prot_virt_guest; -#endif - return sysfs_emit(buf, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_guest); } static ssize_t uv_is_prot_virt_host(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int val = 0; - -#if IS_ENABLED(CONFIG_KVM) - val = prot_virt_host; -#endif - - return sysfs_emit(buf, "%d\n", val); + return sysfs_emit(buf, "%d\n", prot_virt_host); } static struct kobj_attribute uv_prot_virt_guest = @@ -678,9 +605,27 @@ static const struct attribute *uv_prot_virt_attrs[] = { }; static struct kset *uv_query_kset; +static struct kset *uv_keys_kset; static struct kobject *uv_kobj; -static int __init uv_info_init(void) +static int __init uv_sysfs_dir_init(const struct attribute_group *grp, + struct kset **uv_dir_kset, const char *name) +{ + struct kset *kset; + int rc; + + kset = kset_create_and_add(name, NULL, uv_kobj); + if (!kset) + return -ENOMEM; + *uv_dir_kset = kset; + + rc = sysfs_create_group(&kset->kobj, grp); + if (rc) + kset_unregister(kset); + return rc; +} + +static int __init uv_sysfs_init(void) { int rc = -ENOMEM; @@ -695,17 +640,16 @@ static int __init uv_info_init(void) if (rc) goto out_kobj; - uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); - if (!uv_query_kset) { - rc = -ENOMEM; + rc = uv_sysfs_dir_init(&uv_query_attr_group, &uv_query_kset, "query"); + if (rc) goto out_ind_files; - } - rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); - if (!rc) - return 0; + /* Get installed key hashes if available, ignore any errors */ + if (test_bit_inv(BIT_UVC_CMD_QUERY_KEYS, uv_info.inst_calls_list)) + uv_sysfs_dir_init(&uv_keys_attr_group, &uv_keys_kset, "keys"); + + return 0; - kset_unregister(uv_query_kset); out_ind_files: sysfs_remove_files(uv_kobj, uv_prot_virt_attrs); out_kobj: @@ -713,5 +657,131 @@ out_kobj: kobject_put(uv_kobj); return rc; } -device_initcall(uv_info_init); -#endif +device_initcall(uv_sysfs_init); + +/* + * Find the secret with the secret_id in 
the provided list. + * + * Context: might sleep. + */ +static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN], + const struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 i; + + for (i = 0; i < list->total_num_secrets; i++) { + if (memcmp(secret_id, list->secrets[i].id, UV_SECRET_ID_LEN) == 0) { + *secret = list->secrets[i].hdr; + return 0; + } + } + return -ENOENT; +} + +/* + * Do the actual search for `uv_get_secret_metadata`. + * + * Context: might sleep. + */ +static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list *list, + struct uv_secret_list_item_hdr *secret) +{ + u16 start_idx = 0; + u16 list_rc; + int ret; + + do { + uv_list_secrets(list, start_idx, &list_rc, NULL); + if (list_rc != UVC_RC_EXECUTED && list_rc != UVC_RC_MORE_DATA) { + if (list_rc == UVC_RC_INV_CMD) + return -ENODEV; + else + return -EIO; + } + ret = find_secret_in_page(secret_id, list, secret); + if (ret == 0) + return ret; + start_idx = list->next_secret_idx; + } while (list_rc == UVC_RC_MORE_DATA && start_idx < list->next_secret_idx); + + return -ENOENT; +} + +/** + * uv_get_secret_metadata() - get secret metadata for a given secret id. + * @secret_id: search pattern. + * @secret: output data, containing the secret's metadata. + * + * Search for a secret with the given secret_id in the Ultravisor secret store. + * + * Context: might sleep. + * + * Return: + * * %0: - Found entry; secret->idx and secret->type are valid. + * * %ENOENT - No entry found. + * * %ENODEV: - Not supported: UV not available or command not available. + * * %EIO: - Other unexpected UV error. + */ +int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN], + struct uv_secret_list_item_hdr *secret) +{ + struct uv_secret_list *buf; + int rc; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + rc = find_secret(secret_id, buf, secret); + kfree(buf); + return rc; +} +EXPORT_SYMBOL_GPL(uv_get_secret_metadata); + +/** + * uv_retrieve_secret() - get the secret value for the secret index. + * @secret_idx: Secret index for which the secret should be retrieved. + * @buf: Buffer to store retrieved secret. + * @buf_size: Size of the buffer. The correct buffer size is reported as part of + * the result from `uv_get_secret_metadata`. + * + * Calls the Retrieve Secret UVC and translates the UV return code into an errno. + * + * Context: might sleep. + * + * Return: + * * %0 - Entry found; buffer contains a valid secret. + * * %ENOENT: - No entry found or secret at the index is non-retrievable. + * * %ENODEV: - Not supported: UV not available or command not available. + * * %EINVAL: - Buffer too small for content. + * * %EIO: - Other unexpected UV error. 
+ */ +int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size) +{ + struct uv_cb_retr_secr uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_RETR_SECRET, + .secret_idx = secret_idx, + .buf_addr = (u64)buf, + .buf_size = buf_size, + }; + + uv_call_sched(0, (u64)&uvcb); + + switch (uvcb.header.rc) { + case UVC_RC_EXECUTED: + return 0; + case UVC_RC_INV_CMD: + return -ENODEV; + case UVC_RC_RETR_SECR_STORE_EMPTY: + case UVC_RC_RETR_SECR_INV_SECRET: + case UVC_RC_RETR_SECR_INV_IDX: + return -ENOENT; + case UVC_RC_RETR_SECR_BUF_SMALL: + return -EINVAL; + default: + return -EIO; + } +} +EXPORT_SYMBOL_GPL(uv_retrieve_secret); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index a45b3a4c91db..598b512cde01 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -12,12 +12,15 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/time_namespace.h> #include <linux/random.h> #include <vdso/datapage.h> +#include <asm/vdso/vsyscall.h> +#include <asm/alternative.h> #include <asm/vdso.h> extern char vdso64_start[], vdso64_end[]; @@ -29,12 +32,6 @@ static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - #ifdef CONFIG_TIME_NS struct vdso_data *arch_get_vdso_data(void *vvar_page) { @@ -210,17 +207,22 @@ static unsigned long vdso_addr(unsigned long start, unsigned long len) return addr; } -unsigned long vdso_size(void) +unsigned long vdso_text_size(void) { - unsigned long size = VVAR_NR_PAGES * PAGE_SIZE; + unsigned long size; if (is_compat_task()) - size += vdso32_end - vdso32_start; + size = vdso32_end - vdso32_start; else - size += vdso64_end - vdso64_start; + size = vdso64_end - vdso64_start; return PAGE_ALIGN(size); } +unsigned long vdso_size(void) +{ + return vdso_text_size() + VVAR_NR_PAGES * PAGE_SIZE; +} + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { unsigned long addr = VDSO_BASE; @@ -245,8 +247,25 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) return pagelist; } +static void vdso_apply_alternatives(void) +{ + const struct elf64_shdr *alt, *shdr; + struct alt_instr *start, *end; + const struct elf64_hdr *hdr; + + hdr = (struct elf64_hdr *)vdso64_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + start = (void *)hdr + alt->sh_offset; + end = (void *)hdr + alt->sh_offset + alt->sh_size; + apply_alternatives(start, end); +} + static int __init vdso_init(void) { + vdso_apply_alternatives(); vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); if (IS_ENABLED(CONFIG_COMPAT)) vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index b12a274cbb47..2c5afb88d298 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # List of files in the vdso -KCOV_INSTRUMENT := n - # Include the generic Makefile to check the built vdso. 
include $(srctree)/lib/vdso/Makefile obj-vdso32 = vdso_user_wrapper-32.o note-32.o @@ -19,8 +17,10 @@ KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) KBUILD_AFLAGS_32 += -m31 -s KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin +KBUILD_CFLAGS_32 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin -fasynchronous-unwind-tables LDFLAGS_vdso32.so.dbg += -shared -soname=linux-vdso32.so.1 \ --hash-style=both --build-id=sha1 -melf_s390 -T @@ -32,19 +32,13 @@ obj-y += vdso32_wrapper.o targets += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - # Force dependency (incbin is bad) $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) -$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE +$(obj)/vdso32.so.dbg: $(obj)/vdso32.lds $(obj-vdso32) FORCE $(call if_changed,vdso_and_check) # strip rule for the .so file @@ -62,7 +56,7 @@ quiet_cmd_vdso32cc = VDSO32C $@ cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index 65b9513a5a0e..c916c4f73f76 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -16,7 +16,7 @@ SECTIONS #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO_LBASE + SIZEOF_HEADERS; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index ef9832726097..ad206f2068d8 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -1,16 +1,19 @@ # SPDX-License-Identifier: GPL-2.0 # List of files in the vdso -KCOV_INSTRUMENT := n - # Include the generic Makefile to check the built vdso. 
include $(srctree)/lib/vdso/Makefile -obj-vdso64 = vdso_user_wrapper.o note.o -obj-cvdso64 = vdso64_generic.o getcpu.o -VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + # Build rules targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg @@ -24,9 +27,11 @@ KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS)) KBUILD_AFLAGS_64 += -m64 KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_64 := $(filter-out -mpacked-stack,$(KBUILD_CFLAGS_64)) KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64)) KBUILD_CFLAGS_64 := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_64)) -KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin +KBUILD_CFLAGS_64 := $(filter-out -fno-asynchronous-unwind-tables,$(KBUILD_CFLAGS_64)) +KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin -fasynchronous-unwind-tables ldflags-y := -shared -soname=linux-vdso64.so.1 \ --hash-style=both --build-id=sha1 -T @@ -37,12 +42,6 @@ obj-y += vdso64_wrapper.o targets += vdso64.lds CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) -# Disable gcov profiling, ubsan and kasan for VDSO code -GCOV_PROFILE := n -UBSAN_SANITIZE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n - # Force dependency (incbin is bad) $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so @@ -50,7 +49,7 @@ quiet_cmd_vdso_and_check = VDSO $@ cmd_vdso_and_check = $(cmd_ld); $(cmd_vdso_check) # link rule for the .so file, .lds has to be first -$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE +$(obj)/vdso64.so.dbg: $(obj)/vdso64.lds $(obj-vdso64) $(obj-cvdso64) FORCE $(call if_changed,vdso_and_check) # strip rule for the .so file @@ -72,7 +71,7 @@ quiet_cmd_vdso64cc = VDSO64C $@ cmd_vdso64cc = $(CC) $(c_flags) -c -o $@ $< # Generate VDSO offsets using helper script -gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +gen-vdsosym := $(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h index 34c7a2312f9d..9e5397e7b590 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso64/vdso.h @@ -10,5 +10,6 @@ int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unuse int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); #endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 37e2a505e81d..ec42b7d9cb53 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -4,6 +4,7 @@ * library */ +#include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> @@ -13,10 +14,11 @@ OUTPUT_ARCH(s390:64-bit) SECTIONS { PROVIDE(_vdso_data = . 
- __VVAR_PAGES * PAGE_SIZE); + PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO_LBASE + SIZEOF_HEADERS; + . = SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -42,6 +44,10 @@ SECTIONS .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } + . = ALIGN(8); + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + .dynamic : { *(.dynamic) } :text :dynamic .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr @@ -140,6 +146,7 @@ VERSION __kernel_restart_syscall; __kernel_rt_sigreturn; __kernel_sigreturn; + __kernel_getrandom; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index 57f62596e53b..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -6,8 +6,6 @@ #include <asm/dwarf.h> #include <asm/ptrace.h> -#define WRAPPER_FRAME_SIZE (STACK_FRAME_OVERHEAD+8) - /* * Older glibc version called vdso without allocating a stackframe. This wrapper * is just used to allocate a stackframe. See @@ -15,23 +13,23 @@ * for details. */ .macro vdso_func func - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC - aghi %r15,-WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET (STACK_FRAME_OVERHEAD + WRAPPER_FRAME_SIZE) - CFI_VAL_OFFSET 15, -STACK_FRAME_OVERHEAD - stg %r14,STACK_FRAME_OVERHEAD(%r15) + aghi %r15,-STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD) + CFI_VAL_OFFSET 15,-STACK_FRAME_USER_OVERHEAD + stg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_REL_OFFSET 14,__SFVDSO_RETURN_ADDRESS + xc __SFUSER_BACKCHAIN(8,%r15),__SFUSER_BACKCHAIN(%r15) brasl %r14,__s390_vdso_\func - lg %r14,STACK_FRAME_OVERHEAD(%r15) - aghi %r15,WRAPPER_FRAME_SIZE - CFI_DEF_CFA_OFFSET STACK_FRAME_OVERHEAD + lg %r14,__SFVDSO_RETURN_ADDRESS(%r15) + CFI_RESTORE 14 + aghi %r15,STACK_FRAME_VDSO_OVERHEAD + CFI_DEF_CFA_OFFSET STACK_FRAME_USER_OVERHEAD CFI_RESTORE 15 br %r14 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_func gettimeofday @@ -40,16 +38,13 @@ vdso_func clock_gettime vdso_func getcpu .macro vdso_syscall func,syscall - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC svc \syscall /* Make sure we notice when a syscall returns, which shouldn't happen */ .word 0 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_syscall restart_syscall,__NR_restart_syscall diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S new file mode 100644 index 000000000000..09c034c2f853 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/stringify.h> +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/dwarf.h> +#include <asm/fpu-insn.h> + +#define STATE0 %v0 +#define STATE1 %v1 +#define STATE2 %v2 +#define STATE3 %v3 +#define COPY0 %v4 +#define COPY1 %v5 +#define COPY2 %v6 +#define COPY3 %v7 +#define BEPERM %v19 +#define TMP0 %v20 +#define TMP1 %v21 +#define TMP2 %v22 +#define TMP3 %v23 + + .section .rodata + + .balign 32 +SYM_DATA_START_LOCAL(chacha20_constants) + .long 
0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_END(chacha20_constants) + + .text +/* + * s390 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Does not spill to the stack. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + CFI_STARTPROC + larl %r1,chacha20_constants + + /* COPY0 = "expand 32-byte k" */ + VL COPY0,0,,%r1 + + /* BEPERM = byte selectors for VPERM */ + ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148) + + /* COPY1,COPY2 = key */ + VLM COPY1,COPY2,0,%r3 + + /* COPY3 = counter || zero nonce */ + lg %r3,0(%r4) + VZERO COPY3 + VLVGG COPY3,%r3,0 + + lghi %r1,0 +.Lblock: + VLR STATE0,COPY0 + VLR STATE1,COPY1 + VLR STATE2,COPY2 + VLR STATE3,COPY3 + + lghi %r0,10 +.Ldoubleround: + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ + VSLDB STATE1,STATE1,STATE1,4 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ + VSLDB STATE3,STATE3,STATE3,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ + VSLDB STATE1,STATE1,STATE1,12 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ + VSLDB STATE3,STATE3,STATE3,4 + brctg %r0,.Ldoubleround + + /* OUTPUT0 = STATE0 + COPY0 */ + VAF STATE0,STATE0,COPY0 + /* OUTPUT1 = STATE1 + COPY1 */ + VAF STATE1,STATE1,COPY1 + /* OUTPUT2 = STATE2 + COPY2 */ + VAF STATE2,STATE2,COPY2 + /* OUTPUT3 = STATE3 + COPY3 */ + VAF STATE3,STATE3,COPY3 + + ALTERNATIVE \ + __stringify( \ + /* Convert STATE to little endian and store to OUTPUT */\ + VPERM TMP0,STATE0,STATE0,BEPERM; \ + VPERM TMP1,STATE1,STATE1,BEPERM; \ + VPERM TMP2,STATE2,STATE2,BEPERM; \ + VPERM TMP3,STATE3,STATE3,BEPERM; \ + VSTM TMP0,TMP3,0,%r2), \ + __stringify( \ + /* 32 bit wise little endian store to OUTPUT */ \ + VSTBRF STATE0,0,,%r2; \ + VSTBRF STATE1,16,,%r2; \ + VSTBRF STATE2,32,,%r2; \ + VSTBRF STATE3,48,,%r2; \ + brcl 0,0), \ + ALT_FACILITY(148) + + /* ++COPY3.COUNTER */ + /* alsih %r3,1 */ + .insn rilu,0xcc0a00000000,%r3,1 + alcr %r3,%r1 + VLVGG COPY3,%r3,0 + + /* OUTPUT += 64, --NBLOCKS */ + aghi %r2,64 + brctg 
%r5,.Lblock + + /* COUNTER = COPY3.COUNTER */ + stg %r3,0(%r4) + + /* Zero out potentially sensitive regs */ + VZERO STATE0 + VZERO STATE1 + VZERO STATE2 + VZERO STATE3 + VZERO COPY1 + VZERO COPY2 + + /* Early exit if TMP0-TMP3 have not been used */ + ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148) + + VZERO TMP0 + VZERO TMP1 + VZERO TMP2 + VZERO TMP3 + + br %r14 + CFI_ENDPROC +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/s390/kernel/vdso64/vgetrandom.c b/arch/s390/kernel/vdso64/vgetrandom.c new file mode 100644 index 000000000000..b5268b507fb5 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/facility.h> +#include <uapi/asm-generic/errno.h> +#include "vdso.h" + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (test_facility(129)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c index d296dfc22191..cc8933e04ff7 100644 --- a/arch/s390/kernel/vmcore_info.c +++ b/arch/s390/kernel/vmcore_info.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/vmcore_info.h> -#include <asm/abs_lowcore.h> #include <linux/mm.h> +#include <asm/abs_lowcore.h> +#include <asm/sections.h> #include <asm/setup.h> void arch_crash_save_vmcoreinfo(void) @@ -14,7 +15,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys); abs_lc = get_abs_lowcore(); abs_lc->vmcore_info = paddr_vmcoreinfo_note(); put_abs_lowcore(abs_lc); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 48de296e8905..ff1ddba96352 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ PHDRS { SECTIONS { - . = 0x100000; + . = TEXT_OFFSET; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ @@ -52,21 +52,12 @@ SECTIONS SOFTIRQENTRY_TEXT FTRACE_HOTPATCH_TRAMPOLINES_TEXT *(.text.*_indirect_*) - *(.fixup) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ } :text = 0x0700 RO_DATA(PAGE_SIZE) - .data.rel.ro : { - *(.data.rel.ro .data.rel.ro.*) - } - .got : { - __got_start = .; - *(.got) - __got_end = .; - } . = ALIGN(PAGE_SIZE); _sdata = .; /* Start of data section */ @@ -80,6 +71,15 @@ SECTIONS . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; + .data.rel.ro : { + *(.data.rel.ro .data.rel.ro.*) + } + .got : { + __got_start = .; + *(.got) + __got_end = .; + } + RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE) .data.rel : { *(.data.rel*) @@ -183,39 +183,16 @@ SECTIONS .amode31.data : { *(.amode31.data) } - . = ALIGN(PAGE_SIZE); + . = _samode31 + AMODE31_SIZE; _eamode31 = .; /* early.c uses stsi, which requires page aligned data. */ . 
= ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - PERCPU_SECTION(0x100) + RUNTIME_CONST_VARIABLES -#ifdef CONFIG_PIE_BUILD - .dynsym ALIGN(8) : { - __dynsym_start = .; - *(.dynsym) - __dynsym_end = .; - } - .rela.dyn ALIGN(8) : { - __rela_dyn_start = .; - *(.rela*) - __rela_dyn_end = .; - } - .dynamic ALIGN(8) : { - *(.dynamic) - } - .dynstr ALIGN(8) : { - *(.dynstr) - } -#endif - .hash ALIGN(8) : { - *(.hash) - } - .gnu.hash ALIGN(8) : { - *(.gnu.hash) - } + PERCPU_SECTION(0x100) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ @@ -230,7 +207,6 @@ SECTIONS * it should match struct vmlinux_info */ .vmlinux.info 0 (INFO) : { - QUAD(_stext) /* default_lma */ QUAD(startup_continue) /* entry */ QUAD(__bss_start - _stext) /* image_size */ QUAD(__bss_stop - __bss_start) /* bss_size */ @@ -239,18 +215,14 @@ SECTIONS QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ QUAD(__boot_data_preserved_end - __boot_data_preserved_start) /* bootdata_preserved_size */ -#ifdef CONFIG_PIE_BUILD - QUAD(__dynsym_start) /* dynsym_start */ - QUAD(__rela_dyn_start) /* rela_dyn_start */ - QUAD(__rela_dyn_end) /* rela_dyn_end */ -#else QUAD(__got_start) /* got_start */ QUAD(__got_end) /* got_end */ -#endif QUAD(_eamode31 - _samode31) /* amode31_size */ QUAD(init_mm) QUAD(swapper_pg_dir) QUAD(invalid_pg_dir) + QUAD(__alt_instructions) + QUAD(__alt_instructions_end) #ifdef CONFIG_KASAN QUAD(kasan_early_shadow_page) QUAD(kasan_early_shadow_pte) @@ -282,12 +254,10 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") -#ifndef CONFIG_PIE_BUILD .rela.dyn : { *(.rela.*) *(.rela_*) } ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") -#endif /* Sections to be discarded */ DISCARDS diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 24a18e5ef6e8..234a0ba30510 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -33,24 +33,17 @@ static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_jiffies); -static inline u64 get_vtimer(void) -{ - u64 timer; - - asm volatile("stpt %0" : "=Q" (timer)); - return timer; -} - static inline void set_vtimer(u64 expires) { + struct lowcore *lc = get_lowcore(); u64 timer; asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. 
afterwards */ : "=Q" (timer) : "Q" (expires)); - S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; - S390_lowcore.last_update_timer = expires; + lc->system_timer += lc->last_update_timer - timer; + lc->last_update_timer = expires; } static inline int virt_timer_forward(u64 elapsed) @@ -125,22 +118,23 @@ static void account_system_index_scaled(struct task_struct *p, u64 cputime, static int do_account_vtime(struct task_struct *tsk) { u64 timer, clock, user, guest, system, hardirq, softirq; + struct lowcore *lc = get_lowcore(); - timer = S390_lowcore.last_update_timer; - clock = S390_lowcore.last_update_clock; + timer = lc->last_update_timer; + clock = lc->last_update_clock; asm volatile( " stpt %0\n" /* Store current cpu timer value */ " stckf %1" /* Store current tod clock value */ - : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock) + : "=Q" (lc->last_update_timer), + "=Q" (lc->last_update_clock) : : "cc"); - clock = S390_lowcore.last_update_clock - clock; - timer -= S390_lowcore.last_update_timer; + clock = lc->last_update_clock - clock; + timer -= lc->last_update_timer; if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; + lc->hardirq_timer += timer; else - S390_lowcore.system_timer += timer; + lc->system_timer += timer; /* Update MT utilization calculation */ if (smp_cpu_mtid && @@ -149,16 +143,16 @@ static int do_account_vtime(struct task_struct *tsk) /* Calculate cputime delta */ user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(S390_lowcore.user_timer)); + READ_ONCE(lc->user_timer)); guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(S390_lowcore.guest_timer)); + READ_ONCE(lc->guest_timer)); system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(S390_lowcore.system_timer)); + READ_ONCE(lc->system_timer)); hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(S390_lowcore.hardirq_timer)); + READ_ONCE(lc->hardirq_timer)); softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(S390_lowcore.softirq_timer)); - S390_lowcore.steal_timer += + READ_ONCE(lc->softirq_timer)); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ @@ -184,17 +178,19 @@ static int do_account_vtime(struct task_struct *tsk) void vtime_task_switch(struct task_struct *prev) { + struct lowcore *lc = get_lowcore(); + do_account_vtime(prev); - prev->thread.user_timer = S390_lowcore.user_timer; - prev->thread.guest_timer = S390_lowcore.guest_timer; - prev->thread.system_timer = S390_lowcore.system_timer; - prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; - prev->thread.softirq_timer = S390_lowcore.softirq_timer; - S390_lowcore.user_timer = current->thread.user_timer; - S390_lowcore.guest_timer = current->thread.guest_timer; - S390_lowcore.system_timer = current->thread.system_timer; - S390_lowcore.hardirq_timer = current->thread.hardirq_timer; - S390_lowcore.softirq_timer = current->thread.softirq_timer; + prev->thread.user_timer = lc->user_timer; + prev->thread.guest_timer = lc->guest_timer; + prev->thread.system_timer = lc->system_timer; + prev->thread.hardirq_timer = lc->hardirq_timer; + prev->thread.softirq_timer = lc->softirq_timer; + lc->user_timer = current->thread.user_timer; + lc->guest_timer = current->thread.guest_timer; + lc->system_timer = current->thread.system_timer; + lc->hardirq_timer = current->thread.hardirq_timer; + lc->softirq_timer = current->thread.softirq_timer; } /* @@ -204,28 +200,29 @@ void vtime_task_switch(struct task_struct 
*prev) */ void vtime_flush(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 steal, avg_steal; if (do_account_vtime(tsk)) virt_timer_expire(); - steal = S390_lowcore.steal_timer; - avg_steal = S390_lowcore.avg_steal_timer; + steal = lc->steal_timer; + avg_steal = lc->avg_steal_timer; if ((s64) steal > 0) { - S390_lowcore.steal_timer = 0; + lc->steal_timer = 0; account_steal_time(cputime_to_nsecs(steal)); avg_steal += steal; } - S390_lowcore.avg_steal_timer = avg_steal / 2; + lc->avg_steal_timer = avg_steal / 2; } static u64 vtime_delta(void) { - u64 timer = S390_lowcore.last_update_timer; - - S390_lowcore.last_update_timer = get_vtimer(); + struct lowcore *lc = get_lowcore(); + u64 timer = lc->last_update_timer; - return timer - S390_lowcore.last_update_timer; + lc->last_update_timer = get_cpu_timer(); + return timer - lc->last_update_timer; } /* @@ -234,12 +231,13 @@ static u64 vtime_delta(void) */ void vtime_account_kernel(struct task_struct *tsk) { + struct lowcore *lc = get_lowcore(); u64 delta = vtime_delta(); if (tsk->flags & PF_VCPU) - S390_lowcore.guest_timer += delta; + lc->guest_timer += delta; else - S390_lowcore.system_timer += delta; + lc->system_timer += delta; virt_timer_forward(delta); } @@ -249,7 +247,7 @@ void vtime_account_softirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.softirq_timer += delta; + get_lowcore()->softirq_timer += delta; virt_timer_forward(delta); } @@ -258,7 +256,7 @@ void vtime_account_hardirq(struct task_struct *tsk) { u64 delta = vtime_delta(); - S390_lowcore.hardirq_timer += delta; + get_lowcore()->hardirq_timer += delta; virt_timer_forward(delta); } diff --git a/arch/s390/kernel/wti.c b/arch/s390/kernel/wti.c new file mode 100644 index 000000000000..949fdbf0e8b6 --- /dev/null +++ b/arch/s390/kernel/wti.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for warning track interruption + * + * Copyright IBM Corp. 2023 + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/smpboot.h> +#include <linux/irq.h> +#include <uapi/linux/sched/types.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/sclp.h> + +#define WTI_DBF_LEN 64 + +struct wti_debug { + unsigned long missed; + unsigned long addr; + pid_t pid; +}; + +struct wti_state { + /* debug data for s390dbf */ + struct wti_debug dbg; + /* + * Represents the real-time thread responsible to + * acknowledge the warning-track interrupt and trigger + * preliminary and postliminary precautions. + */ + struct task_struct *thread; + /* + * If pending is true, the real-time thread must be scheduled. + * If not, a wake up of that thread will remain a noop. + */ + bool pending; +}; + +static DEFINE_PER_CPU(struct wti_state, wti_state); + +static debug_info_t *wti_dbg; + +/* + * During a warning-track grace period, interrupts are disabled + * to prevent delays of the warning-track acknowledgment. + * + * Once the CPU is physically dispatched again, interrupts are + * re-enabled. 
+ */ + +static void wti_irq_disable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* disable all I/O interrupts */ + cr6.val &= ~0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void wti_irq_enable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* enable all I/O interrupts */ + cr6.val |= 0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void store_debug_data(struct wti_state *st) +{ + struct pt_regs *regs = get_irq_regs(); + + st->dbg.pid = current->pid; + st->dbg.addr = 0; + if (!user_mode(regs)) + st->dbg.addr = regs->psw.addr; +} + +static void wti_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct wti_state *st = this_cpu_ptr(&wti_state); + + inc_irq_stat(IRQEXT_WTI); + wti_irq_disable(); + store_debug_data(st); + st->pending = true; + wake_up_process(st->thread); +} + +static int wti_pending(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + return st->pending; +} + +static void wti_dbf_grace_period(struct wti_state *st) +{ + struct wti_debug *wdi = &st->dbg; + char buf[WTI_DBF_LEN]; + + if (wdi->addr) + snprintf(buf, sizeof(buf), "%d %pS", wdi->pid, (void *)wdi->addr); + else + snprintf(buf, sizeof(buf), "%d <user>", wdi->pid); + debug_text_event(wti_dbg, 2, buf); + wdi->missed++; +} + +static int wti_show(struct seq_file *seq, void *v) +{ + struct wti_state *st; + int cpu; + + cpus_read_lock(); + seq_puts(seq, " "); + for_each_online_cpu(cpu) + seq_printf(seq, "CPU%-8d", cpu); + seq_putc(seq, '\n'); + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + seq_printf(seq, " %10lu", st->dbg.missed); + } + seq_putc(seq, '\n'); + cpus_read_unlock(); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(wti); + +static void wti_thread_fn(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + st->pending = false; + /* + * Yield CPU voluntarily to the hypervisor. Control + * resumes when hypervisor decides to dispatch CPU + * to this LPAR again. 
+ */ + if (diag49c(DIAG49C_SUBC_ACK)) + wti_dbf_grace_period(st); + wti_irq_enable(); +} + +static struct smp_hotplug_thread wti_threads = { + .store = &wti_state.thread, + .thread_should_run = wti_pending, + .thread_fn = wti_thread_fn, + .thread_comm = "cpuwti/%u", + .selfparking = false, +}; + +static int __init wti_init(void) +{ + struct sched_param wti_sched_param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct dentry *wti_dir; + struct wti_state *st; + int cpu, rc; + + rc = -EOPNOTSUPP; + if (!sclp.has_wti) + goto out; + rc = smpboot_register_percpu_thread(&wti_threads); + if (WARN_ON(rc)) + goto out; + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + sched_setscheduler(st->thread, SCHED_FIFO, &wti_sched_param); + } + rc = register_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); + if (rc) { + pr_warn("Couldn't request external interrupt 0x1007\n"); + goto out_thread; + } + irq_subclass_register(IRQ_SUBCLASS_WARNING_TRACK); + rc = diag49c(DIAG49C_SUBC_REG); + if (rc) { + pr_warn("Failed to register warning track interrupt through DIAG 49C\n"); + rc = -EOPNOTSUPP; + goto out_subclass; + } + wti_dir = debugfs_create_dir("wti", arch_debugfs_dir); + debugfs_create_file("stat", 0400, wti_dir, NULL, &wti_fops); + wti_dbg = debug_register("wti", 1, 1, WTI_DBF_LEN); + if (!wti_dbg) { + rc = -ENOMEM; + goto out_debug_register; + } + rc = debug_register_view(wti_dbg, &debug_hex_ascii_view); + if (rc) + goto out_debug_register; + goto out; +out_debug_register: + debug_unregister(wti_dbg); +out_subclass: + irq_subclass_unregister(IRQ_SUBCLASS_WARNING_TRACK); + unregister_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); +out_thread: + smpboot_unregister_percpu_thread(&wti_threads); +out: + return rc; +} +late_initcall(wti_init);
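To close with the generic pattern wti.c builds on: the per-CPU acknowledgment thread uses the common smpboot infrastructure. A minimal self-contained sketch of that pattern follows; all "demo" names are invented for illustration and are not part of the patch.

/* Minimal smpboot per-CPU thread sketch (invented "demo" names). */
#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);
static DEFINE_PER_CPU(bool, demo_pending);

static int demo_should_run(unsigned int cpu)
{
	/* Called by the kthread loop to decide whether thread_fn runs. */
	return per_cpu(demo_pending, cpu);
}

static void demo_fn(unsigned int cpu)
{
	/* Per-CPU kthread context; sleeping is allowed here. */
	per_cpu(demo_pending, cpu) = false;
	/* ... do the deferred per-CPU work, e.g. acknowledge an event ... */
}

static struct smp_hotplug_thread demo_threads = {
	.store			= &demo_task,
	.thread_should_run	= demo_should_run,
	.thread_fn		= demo_fn,
	.thread_comm		= "demo/%u",
};

/*
 * Register once, e.g. from an initcall; an interrupt handler then sets
 * demo_pending and calls wake_up_process() on the stored task, exactly
 * as wti_interrupt() does:
 *
 *	rc = smpboot_register_percpu_thread(&demo_threads);
 */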