diff options
Diffstat (limited to 'arch/s390/kernel')
37 files changed, 1387 insertions, 410 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index e47a4be54ff8..48caae8c7e10 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -36,22 +36,23 @@ CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls -obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o +obj-y := head64.o traps.o time.o process.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o obj-y += sysinfo.o lgr.o os_info.o ctlreg.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o -obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o +obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o extra-y += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) +obj-$(CONFIG_SYSFS) += cpacf.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o +obj-$(CONFIG_SCHED_TOPOLOGY) += topology.o hiperdispatch.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_AUDIT) += audit.o compat-obj-$(CONFIG_AUDIT) += compat_audit.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ffa0dd2dbaac..5529248d84fb 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -112,8 +112,7 @@ int main(void) OFFSET(__LC_MCK_NEW_PSW, lowcore, mcck_new_psw); OFFSET(__LC_IO_NEW_PSW, lowcore, io_new_psw); /* software defined lowcore locations 0x200 - 0xdff*/ - OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync); - OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async); + OFFSET(__LC_SAVE_AREA, lowcore, save_area); OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart); OFFSET(__LC_PCPU, lowcore, pcpu); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 1942e2a9f8db..5a86b9d1da71 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -24,11 +24,11 @@ #include <linux/tty.h> #include <linux/personality.h> #include <linux/binfmts.h> +#include <asm/vdso-symbols.h> #include <asm/access-regs.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/lowcore.h> -#include <asm/vdso.h> #include <asm/fpu.h> #include "compat_linux.h" #include "compat_ptrace.h" diff --git a/arch/s390/kernel/cpacf.c b/arch/s390/kernel/cpacf.c new file mode 100644 index 000000000000..c8575dbc890d --- /dev/null +++ b/arch/s390/kernel/cpacf.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "cpacf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/sysfs.h> +#include <asm/cpacf.h> + +#define CPACF_QUERY(name, instruction) \ +static ssize_t name##_query_raw_read(struct file *fp, \ + struct kobject *kobj, \ + struct bin_attribute *attr, \ + char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_mask_t mask; \ + \ + if (!cpacf_query(CPACF_##instruction, &mask)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &mask, sizeof(mask)); \ +} \ +static BIN_ATTR_RO(name##_query_raw, sizeof(cpacf_mask_t)) + +CPACF_QUERY(km, KM); +CPACF_QUERY(kmc, KMC); +CPACF_QUERY(kimd, KIMD); +CPACF_QUERY(klmd, KLMD); +CPACF_QUERY(kmac, KMAC); +CPACF_QUERY(pckmo, PCKMO); +CPACF_QUERY(kmf, KMF); +CPACF_QUERY(kmctr, KMCTR); +CPACF_QUERY(kmo, KMO); +CPACF_QUERY(pcc, PCC); +CPACF_QUERY(prno, PRNO); +CPACF_QUERY(kma, KMA); +CPACF_QUERY(kdsa, KDSA); + +#define CPACF_QAI(name, instruction) \ +static ssize_t name##_query_auth_info_raw_read( \ + struct file *fp, struct kobject *kobj, \ + struct bin_attribute *attr, char *buf, loff_t offs, \ + size_t count) \ +{ \ + cpacf_qai_t qai; \ + \ + if (!cpacf_qai(CPACF_##instruction, &qai)) \ + return -EOPNOTSUPP; \ + return memory_read_from_buffer(buf, count, &offs, &qai, \ + sizeof(qai)); \ +} \ +static BIN_ATTR_RO(name##_query_auth_info_raw, sizeof(cpacf_qai_t)) + +CPACF_QAI(km, KM); +CPACF_QAI(kmc, KMC); +CPACF_QAI(kimd, KIMD); +CPACF_QAI(klmd, KLMD); +CPACF_QAI(kmac, KMAC); +CPACF_QAI(pckmo, PCKMO); +CPACF_QAI(kmf, KMF); +CPACF_QAI(kmctr, KMCTR); +CPACF_QAI(kmo, KMO); +CPACF_QAI(pcc, PCC); +CPACF_QAI(prno, PRNO); +CPACF_QAI(kma, KMA); +CPACF_QAI(kdsa, KDSA); + +static struct bin_attribute *cpacf_attrs[] = { + &bin_attr_km_query_raw, + &bin_attr_kmc_query_raw, + &bin_attr_kimd_query_raw, + &bin_attr_klmd_query_raw, + &bin_attr_kmac_query_raw, + &bin_attr_pckmo_query_raw, + &bin_attr_kmf_query_raw, + &bin_attr_kmctr_query_raw, + &bin_attr_kmo_query_raw, + &bin_attr_pcc_query_raw, + &bin_attr_prno_query_raw, + &bin_attr_kma_query_raw, + &bin_attr_kdsa_query_raw, + &bin_attr_km_query_auth_info_raw, + &bin_attr_kmc_query_auth_info_raw, + &bin_attr_kimd_query_auth_info_raw, + &bin_attr_klmd_query_auth_info_raw, + &bin_attr_kmac_query_auth_info_raw, + &bin_attr_pckmo_query_auth_info_raw, + &bin_attr_kmf_query_auth_info_raw, + &bin_attr_kmctr_query_auth_info_raw, + &bin_attr_kmo_query_auth_info_raw, + &bin_attr_pcc_query_auth_info_raw, + &bin_attr_prno_query_auth_info_raw, + &bin_attr_kma_query_auth_info_raw, + &bin_attr_kdsa_query_auth_info_raw, + NULL, +}; + +static const struct attribute_group cpacf_attr_grp = { + .name = "cpacf", + .bin_attrs = cpacf_attrs, +}; + +static int __init cpacf_init(void) +{ + struct device *cpu_root; + int rc = 0; + + cpu_root = bus_get_dev_root(&cpu_subsys); + if (cpu_root) { + rc = sysfs_create_group(&cpu_root->kobj, &cpacf_attr_grp); + put_device(cpu_root); + } + return rc; +} +device_initcall(cpacf_init); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index bce50ca75ea7..e62bea9ab21e 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -163,7 +163,6 @@ static const struct file_operations debug_file_ops = { .write = debug_input, .open = debug_open, .release = debug_close, - .llseek = no_llseek, }; static struct dentry *debug_debugfs_root_entry; diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index ac7b8c8e3133..007e1795670e 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -52,6 +52,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, + [DIAG_STAT_X49C] = { .code = 0x49c, .name = "Warning-Track Interruption" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; @@ -303,3 +304,19 @@ int diag26c(void *req, void *resp, enum diag26c_sc subcode) return diag_amode31_ops.diag26c(virt_to_phys(req), virt_to_phys(resp), subcode); } EXPORT_SYMBOL(diag26c); + +int diag49c(unsigned long subcode) +{ + int rc; + + diag_stat_inc(DIAG_STAT_X49C); + asm volatile( + " diag %[subcode],0,0x49c\n" + " ipm %[rc]\n" + " srl %[rc],28\n" + : [rc] "=d" (rc) + : [subcode] "d" (subcode) + : "cc"); + return rc; +} +EXPORT_SYMBOL(diag49c); diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 89dc826a8d2e..94eb8168ea44 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -122,6 +122,7 @@ enum { U8_32, /* 8 bit unsigned value starting at 32 */ U12_16, /* 12 bit unsigned value starting at 16 */ U16_16, /* 16 bit unsigned value starting at 16 */ + U16_20, /* 16 bit unsigned value starting at 20 */ U16_32, /* 16 bit unsigned value starting at 32 */ U32_16, /* 32 bit unsigned value starting at 16 */ VX_12, /* Vector index register starting at position 12 */ @@ -184,6 +185,7 @@ static const struct s390_operand operands[] = { [U8_32] = { 8, 32, 0 }, [U12_16] = { 12, 16, 0 }, [U16_16] = { 16, 16, 0 }, + [U16_20] = { 16, 20, 0 }, [U16_32] = { 16, 32, 0 }, [U32_16] = { 32, 16, 0 }, [VX_12] = { 4, 12, OPERAND_INDEX | OPERAND_VR }, @@ -257,7 +259,6 @@ static const unsigned char formats[][6] = { [INSTR_RSL_R0RD] = { D_20, L4_8, B_16, 0, 0, 0 }, [INSTR_RSY_AARD] = { A_8, A_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_CCRD] = { C_8, C_12, D20_20, B_16, 0, 0 }, - [INSTR_RSY_RDRU] = { R_8, D20_20, B_16, U4_12, 0, 0 }, [INSTR_RSY_RRRD] = { R_8, R_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD] = { R_8, U4_12, D20_20, B_16, 0, 0 }, [INSTR_RSY_RURD2] = { R_8, D20_20, B_16, U4_12, 0, 0 }, @@ -300,14 +301,17 @@ static const unsigned char formats[][6] = { [INSTR_VRI_V0UU2] = { V_8, U16_16, U4_32, 0, 0, 0 }, [INSTR_VRI_V0UUU] = { V_8, U8_16, U8_24, U4_32, 0, 0 }, [INSTR_VRI_VR0UU] = { V_8, R_12, U8_28, U4_24, 0, 0 }, + [INSTR_VRI_VV0UU] = { V_8, V_12, U8_28, U4_24, 0, 0 }, [INSTR_VRI_VVUU] = { V_8, V_12, U16_16, U4_32, 0, 0 }, [INSTR_VRI_VVUUU] = { V_8, V_12, U12_16, U4_32, U4_28, 0 }, [INSTR_VRI_VVUUU2] = { V_8, V_12, U8_28, U8_16, U4_24, 0 }, [INSTR_VRI_VVV0U] = { V_8, V_12, V_16, U8_24, 0, 0 }, [INSTR_VRI_VVV0UU] = { V_8, V_12, V_16, U8_24, U4_32, 0 }, [INSTR_VRI_VVV0UU2] = { V_8, V_12, V_16, U8_28, U4_24, 0 }, - [INSTR_VRR_0V] = { V_12, 0, 0, 0, 0, 0 }, + [INSTR_VRI_VVV0UV] = { V_8, V_12, V_16, V_32, U8_24, 0 }, + [INSTR_VRR_0V0U] = { V_12, U16_20, 0, 0, 0, 0 }, [INSTR_VRR_0VV0U] = { V_12, V_16, U4_24, 0, 0, 0 }, + [INSTR_VRR_0VVU] = { V_12, V_16, U16_20, 0, 0, 0 }, [INSTR_VRR_RV0UU] = { R_8, V_12, U4_24, U4_28, 0, 0 }, [INSTR_VRR_VRR] = { V_8, R_12, R_16, 0, 0, 0 }, [INSTR_VRR_VV] = { V_8, V_12, 0, 0, 0, 0 }, @@ -455,21 +459,21 @@ static int print_insn(char *buffer, unsigned char *code, unsigned long addr) if (separator) ptr += sprintf(ptr, "%c", separator); if (operand->flags & OPERAND_GPR) - ptr += sprintf(ptr, "%%r%i", value); + ptr += sprintf(ptr, "%%r%u", value); else if (operand->flags & OPERAND_FPR) - ptr += sprintf(ptr, "%%f%i", value); + ptr += sprintf(ptr, "%%f%u", value); else if (operand->flags & OPERAND_AR) - ptr += sprintf(ptr, "%%a%i", value); + ptr += sprintf(ptr, "%%a%u", value); else if (operand->flags & OPERAND_CR) - ptr += sprintf(ptr, "%%c%i", value); + ptr += sprintf(ptr, "%%c%u", value); else if (operand->flags & OPERAND_VR) - ptr += sprintf(ptr, "%%v%i", value); + ptr += sprintf(ptr, "%%v%u", value); else if (operand->flags & OPERAND_PCREL) { void *pcrel = (void *)((int)value + addr); ptr += sprintf(ptr, "%px", pcrel); } else if (operand->flags & OPERAND_SIGNED) - ptr += sprintf(ptr, "%i", value); + ptr += sprintf(ptr, "%i", (int)value); else ptr += sprintf(ptr, "%u", value); if (operand->flags & OPERAND_DISP) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 14d324865e33..62f8f5a750a3 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -7,6 +7,7 @@ #define KMSG_COMPONENT "setup" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <linux/sched/debug.h> #include <linux/compiler.h> #include <linux/init.h> #include <linux/errno.h> @@ -175,20 +176,45 @@ static __init void setup_topology(void) topology_max_mnest = max_mnest; } -void __do_early_pgm_check(struct pt_regs *regs) +void __init __do_early_pgm_check(struct pt_regs *regs) { - if (!fixup_exception(regs)) - disabled_wait(); + struct lowcore *lc = get_lowcore(); + unsigned long ip; + + regs->int_code = lc->pgm_int_code; + regs->int_parm_long = lc->trans_exc_code; + ip = __rewind_psw(regs->psw, regs->int_code >> 16); + + /* Monitor Event? Might be a warning */ + if ((regs->int_code & PGM_INT_CODE_MASK) == 0x40) { + if (report_bug(ip, regs) == BUG_TRAP_TYPE_WARN) + return; + } + if (fixup_exception(regs)) + return; + /* + * Unhandled exception - system cannot continue but try to get some + * helpful messages to the console. Use early_printk() to print + * some basic information in case it is too early for printk(). + */ + register_early_console(); + early_printk("PANIC: early exception %04x PSW: %016lx %016lx\n", + regs->int_code & 0xffff, regs->psw.mask, regs->psw.addr); + show_regs(regs); + disabled_wait(); } static noinline __init void setup_lowcore_early(void) { + struct lowcore *lc = get_lowcore(); psw_t psw; psw.addr = (unsigned long)early_pgm_check_handler; psw.mask = PSW_KERNEL_BITS; - get_lowcore()->program_new_psw = psw; - get_lowcore()->preempt_count = INIT_PREEMPT_COUNT; + lc->program_new_psw = psw; + lc->preempt_count = INIT_PREEMPT_COUNT; + lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); + lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); } static __init void detect_diag9c(void) @@ -242,6 +268,8 @@ static __init void detect_machine_facilities(void) } if (test_facility(194)) get_lowcore()->machine_flags |= MACHINE_FLAG_RDP; + if (test_facility(85)) + get_lowcore()->machine_flags |= MACHINE_FLAG_SEQ_INSN; } static inline void save_vector_registers(void) diff --git a/arch/s390/kernel/early_printk.c b/arch/s390/kernel/early_printk.c index d9d53f44008a..cefe020a3be3 100644 --- a/arch/s390/kernel/early_printk.c +++ b/arch/s390/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <linux/console.h> #include <linux/kernel.h> #include <linux/init.h> +#include <asm/setup.h> #include <asm/sclp.h> static void sclp_early_write(struct console *con, const char *s, unsigned int len) @@ -20,6 +21,16 @@ static struct console sclp_early_console = { .index = -1, }; +void __init register_early_console(void) +{ + if (early_console) + return; + if (!sclp.has_linemode && !sclp.has_vt220) + return; + early_console = &sclp_early_console; + register_console(early_console); +} + static int __init setup_early_printk(char *buf) { if (early_console) @@ -27,10 +38,7 @@ static int __init setup_early_printk(char *buf) /* Accept only "earlyprintk" and "earlyprintk=sclp" */ if (buf && !str_has_prefix(buf, "sclp")) return 0; - if (!sclp.has_linemode && !sclp.has_vt220) - return 0; - early_console = &sclp_early_console; - register_console(early_console); + register_early_console(); return 0; } early_param("earlyprintk", setup_early_printk); diff --git a/arch/s390/kernel/earlypgm.S b/arch/s390/kernel/earlypgm.S deleted file mode 100644 index c634871f0d90..000000000000 --- a/arch/s390/kernel/earlypgm.S +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright IBM Corp. 2006, 2007 - * Author(s): Michael Holzheu <holzheu@de.ibm.com> - */ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -SYM_CODE_START(early_pgm_check_handler) - stmg %r8,%r15,__LC_SAVE_AREA_SYNC - aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) - la %r11,STACK_FRAME_OVERHEAD(%r15) - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - lgr %r2,%r11 - brasl %r14,__do_early_pgm_check - mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) - lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) - lpswe __LC_RETURN_PSW -SYM_CODE_END(early_pgm_check_handler) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 749410cfdbc0..d6d5317f768e 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -42,7 +42,7 @@ _LPP_OFFSET = __LC_LPP .macro LPSWEY address, lpswe ALTERNATIVE_2 "b \lpswe;nopr", \ - ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY_EARLY(193), \ + ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193), \ __stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0), \ ALT_LOWCORE .endm @@ -264,7 +264,7 @@ EXPORT_SYMBOL(sie_exit) */ SYM_CODE_START(system_call) - STMG_LC %r8,%r15,__LC_SAVE_AREA_SYNC + STMG_LC %r8,%r15,__LC_SAVE_AREA GET_LC %r13 stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF @@ -287,7 +287,7 @@ SYM_CODE_START(system_call) xgr %r10,%r10 xgr %r11,%r11 la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - mvc __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC(%r13) + mvc __PT_R8(64,%r2),__LC_SAVE_AREA(%r13) MBEAR %r2,%r13 lgr %r3,%r14 brasl %r14,__do_syscall @@ -323,7 +323,7 @@ SYM_CODE_END(ret_from_fork) */ SYM_CODE_START(pgm_check_handler) - STMG_LC %r8,%r15,__LC_SAVE_AREA_SYNC + STMG_LC %r8,%r15,__LC_SAVE_AREA GET_LC %r13 stpt __LC_SYS_ENTER_TIMER(%r13) BPOFF @@ -338,16 +338,16 @@ SYM_CODE_START(pgm_check_handler) jnz 2f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -2: CHECK_STACK __LC_SAVE_AREA_SYNC,%r13 +2: CHECK_STACK __LC_SAVE_AREA,%r13 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) # CHECK_VMAP_STACK branches to stack_overflow or 4f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r13,4f + CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f 3: lg %r15,__LC_KERNEL_STACK(%r13) 4: la %r11,STACK_FRAME_OVERHEAD(%r15) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13) stctg %c1,%c1,__PT_CR1(%r11) #if IS_ENABLED(CONFIG_KVM) @@ -398,7 +398,7 @@ SYM_CODE_END(pgm_check_handler) */ .macro INT_HANDLER name,lc_old_psw,handler SYM_CODE_START(\name) - STMG_LC %r8,%r15,__LC_SAVE_AREA_ASYNC + STMG_LC %r8,%r15,__LC_SAVE_AREA GET_LC %r13 stckf __LC_INT_CLOCK(%r13) stpt __LC_SYS_ENTER_TIMER(%r13) @@ -414,7 +414,7 @@ SYM_CODE_START(\name) BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST SIEEXIT __SF_SIE_CONTROL(%r15),%r13 #endif -0: CHECK_STACK __LC_SAVE_AREA_ASYNC,%r13 +0: CHECK_STACK __LC_SAVE_AREA,%r13 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j 2f 1: lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13) @@ -432,7 +432,7 @@ SYM_CODE_START(\name) xgr %r7,%r7 xgr %r10,%r10 xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) MBEAR %r11,%r13 stmg %r8,%r9,__PT_PSW(%r11) lgr %r2,%r11 # pass pointer to pt_regs @@ -599,6 +599,24 @@ SYM_CODE_START(restart_int_handler) 3: j 3b SYM_CODE_END(restart_int_handler) + __INIT +SYM_CODE_START(early_pgm_check_handler) + STMG_LC %r8,%r15,__LC_SAVE_AREA + GET_LC %r13 + aghi %r15,-(STACK_FRAME_OVERHEAD+__PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_PSW(16,%r11),__LC_PGM_OLD_PSW(%r13) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13) + lgr %r2,%r11 + brasl %r14,__do_early_pgm_check + mvc __LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + LPSWEY __LC_RETURN_PSW,__LC_RETURN_LPSWE +SYM_CODE_END(early_pgm_check_handler) + __FINIT + .section .kprobes.text, "ax" #if defined(CONFIG_CHECK_STACK) || defined(CONFIG_VMAP_STACK) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0bd6adc40a34..0b6e62d1d8b8 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -50,10 +50,6 @@ struct ftrace_insn { s32 disp; } __packed; -#ifdef CONFIG_MODULES -static char *ftrace_plt; -#endif /* CONFIG_MODULES */ - static const char *ftrace_shared_hotpatch_trampoline(const char **end) { const char *tstart, *tend; @@ -73,19 +69,20 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end) bool ftrace_need_init_nop(void) { - return true; + return !MACHINE_HAS_SEQ_INSN; } int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { static struct ftrace_hotpatch_trampoline *next_vmlinux_trampoline = __ftrace_hotpatch_trampolines_start; - static const char orig[6] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; + static const struct ftrace_insn orig = { .opc = 0xc004, .disp = 0 }; static struct ftrace_hotpatch_trampoline *trampoline; struct ftrace_hotpatch_trampoline **next_trampoline; struct ftrace_hotpatch_trampoline *trampolines_end; struct ftrace_hotpatch_trampoline tmp; struct ftrace_insn *insn; + struct ftrace_insn old; const char *shared; s32 disp; @@ -99,7 +96,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) if (mod) { next_trampoline = &mod->arch.next_trampoline; trampolines_end = mod->arch.trampolines_end; - shared = ftrace_plt; } #endif @@ -107,8 +103,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) return -ENOMEM; trampoline = (*next_trampoline)++; + if (copy_from_kernel_nofault(&old, (void *)rec->ip, sizeof(old))) + return -EFAULT; /* Check for the compiler-generated fentry nop (brcl 0, .). */ - if (WARN_ON_ONCE(memcmp((const void *)rec->ip, &orig, sizeof(orig)))) + if (WARN_ON_ONCE(memcmp(&orig, &old, sizeof(old)))) return -EINVAL; /* Generate the trampoline. */ @@ -144,8 +142,35 @@ static struct ftrace_hotpatch_trampoline *ftrace_get_trampoline(struct dyn_ftrac return trampoline; } -int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, - unsigned long addr) +static inline struct ftrace_insn +ftrace_generate_branch_insn(unsigned long ip, unsigned long target) +{ + /* brasl r0,target or brcl 0,0 */ + return (struct ftrace_insn){ .opc = target ? 0xc005 : 0xc004, + .disp = target ? (target - ip) / 2 : 0 }; +} + +static int ftrace_patch_branch_insn(unsigned long ip, unsigned long old_target, + unsigned long target) +{ + struct ftrace_insn orig = ftrace_generate_branch_insn(ip, old_target); + struct ftrace_insn new = ftrace_generate_branch_insn(ip, target); + struct ftrace_insn old; + + if (!IS_ALIGNED(ip, 8)) + return -EINVAL; + if (copy_from_kernel_nofault(&old, (void *)ip, sizeof(old))) + return -EFAULT; + /* Verify that the to be replaced code matches what we expect. */ + if (memcmp(&orig, &old, sizeof(old))) + return -EINVAL; + s390_kernel_write((void *)ip, &new, sizeof(new)); + return 0; +} + +static int ftrace_modify_trampoline_call(struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; u64 old; @@ -161,6 +186,15 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, return 0; } +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, old_addr, addr); + else + return ftrace_modify_trampoline_call(rec, old_addr, addr); +} + static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) { u16 old; @@ -179,11 +213,14 @@ static int ftrace_patch_branch_mask(void *addr, u16 expected, bool enable) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - /* Expect brcl 0xf,... */ - return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); + /* Expect brcl 0xf,... for the !MACHINE_HAS_SEQ_INSN case */ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, addr, 0); + else + return ftrace_patch_branch_mask((void *)rec->ip, 0xc0f4, false); } -int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +static int ftrace_make_trampoline_call(struct dyn_ftrace *rec, unsigned long addr) { struct ftrace_hotpatch_trampoline *trampoline; @@ -195,6 +232,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_patch_branch_mask((void *)rec->ip, 0xc004, true); } +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + if (MACHINE_HAS_SEQ_INSN) + return ftrace_patch_branch_insn(rec->ip, 0, addr); + else + return ftrace_make_trampoline_call(rec, addr); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { ftrace_func = func; @@ -215,25 +260,6 @@ void ftrace_arch_code_modify_post_process(void) text_poke_sync_lock(); } -#ifdef CONFIG_MODULES - -static int __init ftrace_plt_init(void) -{ - const char *start, *end; - - ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE); - if (!ftrace_plt) - panic("cannot allocate ftrace plt\n"); - - start = ftrace_shared_hotpatch_trampoline(&end); - memcpy(ftrace_plt, start, end - start); - set_memory_rox((unsigned long)ftrace_plt, 1); - return 0; -} -device_initcall(ftrace_plt_init); - -#endif /* CONFIG_MODULES */ - #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * Hook the return address and push it in the stack of return addresses @@ -264,26 +290,14 @@ NOKPROBE_SYMBOL(prepare_ftrace_return); */ int ftrace_enable_ftrace_graph_caller(void) { - int rc; - /* Expect brc 0xf,... */ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; + return ftrace_patch_branch_mask(ftrace_graph_caller, 0xa7f4, false); } int ftrace_disable_ftrace_graph_caller(void) { - int rc; - /* Expect brc 0x0,... */ - rc = ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); - if (rc) - return rc; - text_poke_sync_lock(); - return 0; + return ftrace_patch_branch_mask(ftrace_graph_caller, 0xa704, true); } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/s390/kernel/ftrace.h b/arch/s390/kernel/ftrace.h index 7f75a9616406..23337065f402 100644 --- a/arch/s390/kernel/ftrace.h +++ b/arch/s390/kernel/ftrace.h @@ -18,7 +18,5 @@ extern const char ftrace_shared_hotpatch_trampoline_br[]; extern const char ftrace_shared_hotpatch_trampoline_br_end[]; extern const char ftrace_shared_hotpatch_trampoline_exrl[]; extern const char ftrace_shared_hotpatch_trampoline_exrl_end[]; -extern const char ftrace_plt_template[]; -extern const char ftrace_plt_template_end[]; #endif /* _FTRACE_H */ diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c new file mode 100644 index 000000000000..2a99a216ab62 --- /dev/null +++ b/arch/s390/kernel/hiperdispatch.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "hd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +/* + * Hiperdispatch: + * Dynamically calculates the optimum number of high capacity COREs + * by considering the state the system is in. When hiperdispatch decides + * that a capacity update is necessary, it schedules a topology update. + * During topology updates the CPU capacities are always re-adjusted. + * + * There is two places where CPU capacities are being accessed within + * hiperdispatch. + * -> hiperdispatch's reoccuring work function reads CPU capacities to + * determine high capacity CPU count. + * -> during a topology update hiperdispatch's adjustment function + * updates CPU capacities. + * These two can run on different CPUs in parallel which can cause + * hiperdispatch to make wrong decisions. This can potentially cause + * some overhead by leading to extra rebuild_sched_domains() calls + * for correction. Access to capacities within hiperdispatch has to be + * serialized to prevent the overhead. + * + * Hiperdispatch decision making revolves around steal time. + * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time + * crosses the threshold value hiperdispatch falls back to giving high + * capacities to entitled CPUs. When steal time drops below the + * threshold boundary, hiperdispatch utilizes all CPUs by giving all + * of them high capacity. + * + * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread + * performance. Comparing the throughput of; + * - single CORE, with N threads, running N tasks + * - N separate COREs running N tasks, + * using individual COREs for individual tasks yield better + * performance. This performance difference is roughly ~30% (can change + * between machine generations) + * + * Hiperdispatch tries to hint scheduler to use individual COREs for + * each task, as long as steal time on those COREs are less than 30%, + * therefore delaying the throughput loss caused by using SMP threads. + */ + +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/kernel_stat.h> +#include <linux/kstrtox.h> +#include <linux/ktime.h> +#include <linux/sysctl.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <asm/hiperdispatch.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/topology.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hiperdispatch.h> + +#define HD_DELAY_FACTOR (4) +#define HD_DELAY_INTERVAL (HZ / 4) +#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_AVG_WEIGHT 16 + +static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ +static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */ +static int hd_high_capacity_cores; /* Current CORE count with high capacity */ +static int hd_entitled_cores; /* Total vertical high and medium CORE count */ +static int hd_online_cores; /* Current online CORE count */ + +static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */ +static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */ +static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */ +static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */ + +static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD; +static unsigned int hd_delay_factor = HD_DELAY_FACTOR; +static int hd_enabled; + +static void hd_capacity_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); + +static int hd_set_hiperdispatch_mode(int enable) +{ + if (!MACHINE_HAS_TOPOLOGY) + enable = 0; + if (hd_enabled == enable) + return 0; + hd_enabled = enable; + return 1; +} + +void hd_reset_state(void) +{ + cpumask_clear(&hd_vl_coremask); + cpumask_clear(&hd_vmvl_cpumask); + hd_entitled_cores = 0; + hd_online_cores = 0; +} + +void hd_add_core(int cpu) +{ + const struct cpumask *siblings; + int polarization; + + hd_online_cores++; + polarization = smp_cpu_get_polarization(cpu); + siblings = topology_sibling_cpumask(cpu); + switch (polarization) { + case POLARIZATION_VH: + hd_entitled_cores++; + break; + case POLARIZATION_VM: + hd_entitled_cores++; + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + case POLARIZATION_VL: + cpumask_set_cpu(cpu, &hd_vl_coremask); + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + } +} + +/* Serialize update and read operations of debug counters. */ +static DEFINE_MUTEX(hd_counter_mutex); + +static void hd_update_times(void) +{ + static ktime_t prev; + ktime_t now; + + /* + * Check if hiperdispatch is active, if not set the prev to 0. + * This way it is possible to differentiate the first update iteration after + * enabling hiperdispatch. + */ + if (hd_entitled_cores == 0 || hd_enabled == 0) { + prev = ktime_set(0, 0); + return; + } + now = ktime_get(); + if (ktime_after(prev, 0)) { + if (hd_high_capacity_cores == hd_online_cores) + hd_high_time += ktime_ms_delta(now, prev); + else + hd_low_time += ktime_ms_delta(now, prev); + } + prev = now; +} + +static void hd_update_capacities(void) +{ + int cpu, upscaling_cores; + unsigned long capacity; + + upscaling_cores = hd_high_capacity_cores - hd_entitled_cores; + capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW; + hd_high_capacity_cores = hd_entitled_cores; + for_each_cpu(cpu, &hd_vl_coremask) { + smp_set_core_capacity(cpu, capacity); + if (capacity != CPU_CAPACITY_HIGH) + continue; + hd_high_capacity_cores++; + upscaling_cores--; + if (upscaling_cores == 0) + capacity = CPU_CAPACITY_LOW; + } +} + +void hd_disable_hiperdispatch(void) +{ + cancel_delayed_work_sync(&hd_capacity_work); + hd_high_capacity_cores = hd_online_cores; + hd_previous_steal = 0; +} + +int hd_enable_hiperdispatch(void) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + mutex_unlock(&hd_counter_mutex); + if (hd_enabled == 0) + return 0; + if (hd_entitled_cores == 0) + return 0; + if (hd_online_cores <= hd_entitled_cores) + return 0; + mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + hd_update_capacities(); + return 1; +} + +static unsigned long hd_steal_avg(unsigned long new) +{ + static unsigned long steal; + + steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT; + return steal; +} + +static unsigned long hd_calculate_steal_percentage(void) +{ + unsigned long time_delta, steal_delta, steal, percentage; + static ktime_t prev; + int cpus, cpu; + ktime_t now; + + cpus = 0; + steal = 0; + percentage = 0; + for_each_cpu(cpu, &hd_vmvl_cpumask) { + steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + cpus++; + } + /* + * If there is no vertical medium and low CPUs steal time + * is 0 as vertical high CPUs shouldn't experience steal time. + */ + if (cpus == 0) + return percentage; + now = ktime_get(); + time_delta = ktime_to_ns(ktime_sub(now, prev)); + if (steal > hd_previous_steal && hd_previous_steal != 0) { + steal_delta = (steal - hd_previous_steal) * 100 / time_delta; + percentage = steal_delta / cpus; + } + hd_previous_steal = steal; + prev = now; + return percentage; +} + +static void hd_capacity_work_fn(struct work_struct *work) +{ + unsigned long steal_percentage, new_cores; + + mutex_lock(&smp_cpu_state_mutex); + /* + * If online cores are less or equal to entitled cores hiperdispatch + * does not need to make any adjustments, call a topology update to + * disable hiperdispatch. + * Normally this check is handled on topology update, but during cpu + * unhotplug, topology and cpu mask updates are done in reverse + * order, causing hd_enable_hiperdispatch() to get stale data. + */ + if (hd_online_cores <= hd_entitled_cores) { + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return; + } + steal_percentage = hd_steal_avg(hd_calculate_steal_percentage()); + if (steal_percentage < hd_steal_threshold) + new_cores = hd_online_cores; + else + new_cores = hd_entitled_cores; + if (hd_high_capacity_cores != new_cores) { + trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores); + hd_high_capacity_cores = new_cores; + atomic64_inc(&hd_adjustments); + topology_schedule_update(); + } + trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores); + mutex_unlock(&smp_cpu_state_mutex); + schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL); +} + +static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int hiperdispatch; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &hiperdispatch, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + hiperdispatch = hd_enabled; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + mutex_lock(&smp_cpu_state_mutex); + if (hd_set_hiperdispatch_mode(hiperdispatch)) + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return 0; +} + +static struct ctl_table hiperdispatch_ctl_table[] = { + { + .procname = "hiperdispatch", + .mode = 0644, + .proc_handler = hiperdispatch_ctl_handler, + }, +}; + +static ssize_t hd_steal_threshold_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_steal_threshold); +} + +static ssize_t hd_steal_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (val > 100) + return -ERANGE; + hd_steal_threshold = val; + return count; +} + +static DEVICE_ATTR_RW(hd_steal_threshold); + +static ssize_t hd_delay_factor_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_delay_factor); +} + +static ssize_t hd_delay_factor_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (!val) + return -ERANGE; + hd_delay_factor = val; + return count; +} + +static DEVICE_ATTR_RW(hd_delay_factor); + +static struct attribute *hd_attrs[] = { + &dev_attr_hd_steal_threshold.attr, + &dev_attr_hd_delay_factor.attr, + NULL, +}; + +static const struct attribute_group hd_attr_group = { + .name = "hiperdispatch", + .attrs = hd_attrs, +}; + +static int hd_greedy_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_high_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n"); + +static int hd_conservative_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_low_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n"); + +static int hd_adjustment_count_get(void *unused, u64 *val) +{ + *val = atomic64_read(&hd_adjustments); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n"); + +static void __init hd_create_debugfs_counters(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir); + debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops); + debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops); + debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops); +} + +static void __init hd_create_attributes(void) +{ + struct device *dev; + + dev = bus_get_dev_root(&cpu_subsys); + if (!dev) + return; + if (sysfs_create_group(&dev->kobj, &hd_attr_group)) + pr_warn("Unable to create hiperdispatch attribute group\n"); + put_device(dev); +} + +static int __init hd_init(void) +{ + if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) { + hd_set_hiperdispatch_mode(1); + topology_schedule_update(); + } + if (!register_sysctl("s390", hiperdispatch_ctl_table)) + pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n"); + hd_create_debugfs_counters(); + hd_create_attributes(); + return 0; +} +late_initcall(hd_init); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 1af5a08d72ab..2639a3d12736 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -76,6 +76,7 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"}, {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, + {.irq = IRQEXT_WTI, .name = "WTI", .desc = "[EXT] Warning Track"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 05c83505e979..6295faf0987d 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -21,6 +21,7 @@ #include <linux/hardirq.h> #include <linux/ftrace.h> #include <linux/execmem.h> +#include <asm/text-patching.h> #include <asm/set_memory.h> #include <asm/sections.h> #include <asm/dis.h> @@ -152,7 +153,12 @@ void arch_arm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (MACHINE_HAS_SEQ_INSN) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_arm_kprobe); @@ -160,7 +166,12 @@ void arch_disarm_kprobe(struct kprobe *p) { struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; - stop_machine_cpuslocked(swap_instruction, &args, NULL); + if (MACHINE_HAS_SEQ_INSN) { + swap_instruction(&args); + text_poke_sync(); + } else { + stop_machine_cpuslocked(swap_instruction, &args, NULL); + } } NOKPROBE_SYMBOL(arch_disarm_kprobe); diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index ae4d4fd9afcd..7e267ef63a7f 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -9,6 +9,7 @@ #include <asm/ftrace.h> #include <asm/nospec-insn.h> #include <asm/ptrace.h> +#include <asm/march.h> #define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -88,7 +89,7 @@ SYM_CODE_START(ftrace_caller) SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_common) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES aghik %r2,%r0,-MCOUNT_INSN_SIZE lgrl %r4,function_trace_op lgrl %r1,ftrace_func @@ -115,7 +116,7 @@ SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) .Lftrace_graph_caller_end: #endif lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES +#ifdef MARCH_HAS_Z196_FEATURES ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) locgrz %r1,%r0 #else diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 23ab9f02f278..ddc1448ea2e1 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -14,9 +14,6 @@ #include <linux/node.h> #include <asm/numa.h> -struct pglist_data *node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - void __init numa_setup(void) { int nid; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 6968be98af11..e2e0aa463fbd 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -22,6 +22,10 @@ #include <asm/hwctrset.h> #include <asm/debug.h> +/* Perf PMU definitions for the counter facility */ +#define PERF_CPUM_CF_MAX_CTR 0xffffUL /* Max ctr for ECCTR */ +#define PERF_EVENT_CPUM_CF_DIAG 0xBC000UL /* Event: Counter sets */ + enum cpumf_ctr_set { CPUMF_CTR_SET_BASIC = 0, /* Basic Counter Set */ CPUMF_CTR_SET_USER = 1, /* Problem-State Counter Set */ @@ -1694,7 +1698,6 @@ static const struct file_operations cfset_fops = { .release = cfset_release, .unlocked_ioctl = cfset_ioctl, .compat_ioctl = cfset_ioctl, - .llseek = no_llseek }; static struct miscdevice cfset_dev = { diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 736c1d9632dd..5b765e3ccf0c 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -24,6 +24,22 @@ #include <asm/timex.h> #include <linux/io.h> +/* Perf PMU definitions for the sampling facility */ +#define PERF_CPUM_SF_MAX_CTR 2 +#define PERF_EVENT_CPUM_SF 0xB0000UL /* Event: Basic-sampling */ +#define PERF_EVENT_CPUM_SF_DIAG 0xBD000UL /* Event: Combined-sampling */ +#define PERF_CPUM_SF_BASIC_MODE 0x0001 /* Basic-sampling flag */ +#define PERF_CPUM_SF_DIAG_MODE 0x0002 /* Diagnostic-sampling flag */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ + +#define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) +#define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) +#define TEAR_REG(hwc) ((hwc)->last_tag) +#define SAMPL_RATE(hwc) ((hwc)->event_base) +#define SAMPL_FLAGS(hwc) ((hwc)->config_base) +#define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) +#define SAMPL_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) + /* Minimum number of sample-data-block-tables: * At least one table is required for the sampling buffer structure. * A single table contains up to 511 pointers to sample-data-blocks. @@ -113,17 +129,6 @@ static inline unsigned long sample_rate_to_freq(struct hws_qsi_info_block *qsi, return USEC_PER_SEC * qsi->cpu_speed / rate; } -/* Return TOD timestamp contained in an trailer entry */ -static inline unsigned long long trailer_timestamp(struct hws_trailer_entry *te) -{ - /* TOD in STCKE format */ - if (te->header.t) - return *((unsigned long long *)&te->timestamp[1]); - - /* TOD in STCK format */ - return *((unsigned long long *)&te->timestamp[0]); -} - /* Return pointer to trailer entry of an sample data block */ static inline struct hws_trailer_entry *trailer_entry_ptr(unsigned long v) { @@ -154,12 +159,12 @@ static inline unsigned long *get_next_sdbt(unsigned long *s) /* * sf_disable() - Switch off sampling facility */ -static int sf_disable(void) +static void sf_disable(void) { struct hws_lsctl_request_block sreq; memset(&sreq, 0, sizeof(sreq)); - return lsctl(&sreq); + lsctl(&sreq); } /* @@ -208,8 +213,6 @@ static void free_sampling_buffer(struct sf_buffer *sfb) } } - debug_sprintf_event(sfdbg, 5, "%s: freed sdbt %#lx\n", __func__, - (unsigned long)sfb->sdbt); memset(sfb, 0, sizeof(*sfb)); } @@ -265,10 +268,8 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, * the sampling buffer origin. */ if (sfb->sdbt != get_next_sdbt(tail)) { - debug_sprintf_event(sfdbg, 3, "%s: " - "sampling buffer is not linked: origin %#lx" - " tail %#lx\n", __func__, - (unsigned long)sfb->sdbt, + debug_sprintf_event(sfdbg, 3, "%s buffer not linked origin %#lx tail %#lx\n", + __func__, (unsigned long)sfb->sdbt, (unsigned long)tail); return -EINVAL; } @@ -318,9 +319,6 @@ static int realloc_sampling_buffer(struct sf_buffer *sfb, *tail = virt_to_phys(sfb->sdbt) + 1; sfb->tail = tail; - debug_sprintf_event(sfdbg, 4, "%s: new buffer" - " settings: sdbt %lu sdb %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); return rc; } @@ -357,15 +355,8 @@ static int alloc_sampling_buffer(struct sf_buffer *sfb, unsigned long num_sdb) /* Allocate requested number of sample-data-blocks */ rc = realloc_sampling_buffer(sfb, num_sdb, GFP_KERNEL); - if (rc) { + if (rc) free_sampling_buffer(sfb); - debug_sprintf_event(sfdbg, 4, "%s: " - "realloc_sampling_buffer failed with rc %i\n", - __func__, rc); - } else - debug_sprintf_event(sfdbg, 4, - "%s: tear %#lx dear %#lx\n", __func__, - (unsigned long)sfb->sdbt, (unsigned long)*sfb->sdbt); return rc; } @@ -377,8 +368,8 @@ static void sfb_set_limits(unsigned long min, unsigned long max) CPUM_SF_MAX_SDB = max; memset(&si, 0, sizeof(si)); - if (!qsi(&si)) - CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); + qsi(&si); + CPUM_SF_SDB_DIAG_FACTOR = DIV_ROUND_UP(si.dsdes, si.bsdes); } static unsigned long sfb_max_limit(struct hw_perf_event *hwc) @@ -397,12 +388,6 @@ static unsigned long sfb_pending_allocs(struct sf_buffer *sfb, return 0; } -static int sfb_has_pending_allocs(struct sf_buffer *sfb, - struct hw_perf_event *hwc) -{ - return sfb_pending_allocs(sfb, hwc) > 0; -} - static void sfb_account_allocs(unsigned long num, struct hw_perf_event *hwc) { /* Limit the number of SDBs to not exceed the maximum */ @@ -426,7 +411,6 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq; - size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -457,7 +441,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). */ - sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); n_sdb = CPUM_SF_MIN_SDB + DIV_ROUND_UP(freq, 10000); @@ -473,12 +456,6 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) if (sf_buffer_available(cpuhw)) return 0; - debug_sprintf_event(sfdbg, 3, - "%s: rate %lu f %lu sdb %lu/%lu" - " sample_size %lu cpuhw %p\n", __func__, - SAMPL_RATE(hwc), freq, n_sdb, sfb_max_limit(hwc), - sample_size, cpuhw); - return alloc_sampling_buffer(&cpuhw->sfb, sfb_pending_allocs(&cpuhw->sfb, hwc)); } @@ -535,8 +512,6 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, if (num) sfb_account_allocs(num, hwc); - debug_sprintf_event(sfdbg, 5, "%s: overflow %llu ratio %lu num %lu\n", - __func__, OVERFLOW_REG(hwc), ratio, num); OVERFLOW_REG(hwc) = 0; } @@ -554,13 +529,11 @@ static void sfb_account_overflows(struct cpu_hw_sf *cpuhw, static void extend_sampling_buffer(struct sf_buffer *sfb, struct hw_perf_event *hwc) { - unsigned long num, num_old; - int rc; + unsigned long num; num = sfb_pending_allocs(sfb, hwc); if (!num) return; - num_old = sfb->num_sdb; /* Disable the sampling facility to reset any states and also * clear pending measurement alerts. @@ -572,51 +545,33 @@ static void extend_sampling_buffer(struct sf_buffer *sfb, * called by perf. Because this is a reallocation, it is fine if the * new SDB-request cannot be satisfied immediately. */ - rc = realloc_sampling_buffer(sfb, num, GFP_ATOMIC); - if (rc) - debug_sprintf_event(sfdbg, 5, "%s: realloc failed with rc %i\n", - __func__, rc); - - if (sfb_has_pending_allocs(sfb, hwc)) - debug_sprintf_event(sfdbg, 5, "%s: " - "req %lu alloc %lu remaining %lu\n", - __func__, num, sfb->num_sdb - num_old, - sfb_pending_allocs(sfb, hwc)); + realloc_sampling_buffer(sfb, num, GFP_ATOMIC); } /* Number of perf events counting hardware events */ -static atomic_t num_events; +static refcount_t num_events; /* Used to avoid races in calling reserve/release_cpumf_hardware */ static DEFINE_MUTEX(pmc_reserve_mutex); #define PMC_INIT 0 #define PMC_RELEASE 1 -#define PMC_FAILURE 2 static void setup_pmc_cpu(void *flags) { - struct cpu_hw_sf *cpusf = this_cpu_ptr(&cpu_hw_sf); - int err = 0; + struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); switch (*((int *)flags)) { case PMC_INIT: - memset(cpusf, 0, sizeof(*cpusf)); - err = qsi(&cpusf->qsi); - if (err) - break; - cpusf->flags |= PMU_F_RESERVED; - err = sf_disable(); + memset(cpuhw, 0, sizeof(*cpuhw)); + qsi(&cpuhw->qsi); + cpuhw->flags |= PMU_F_RESERVED; + sf_disable(); break; case PMC_RELEASE: - cpusf->flags &= ~PMU_F_RESERVED; - err = sf_disable(); - if (!err) - deallocate_buffers(cpusf); + cpuhw->flags &= ~PMU_F_RESERVED; + sf_disable(); + deallocate_buffers(cpuhw); break; } - if (err) { - *((int *)flags) |= PMC_FAILURE; - pr_err("Switching off the sampling facility failed with rc %i\n", err); - } } static void release_pmc_hardware(void) @@ -627,27 +582,19 @@ static void release_pmc_hardware(void) on_each_cpu(setup_pmc_cpu, &flags, 1); } -static int reserve_pmc_hardware(void) +static void reserve_pmc_hardware(void) { int flags = PMC_INIT; on_each_cpu(setup_pmc_cpu, &flags, 1); - if (flags & PMC_FAILURE) { - release_pmc_hardware(); - return -ENODEV; - } irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - return 0; } static void hw_perf_event_destroy(struct perf_event *event) { /* Release PMC if this is the last perf event */ - if (!atomic_add_unless(&num_events, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_events) == 0) - release_pmc_hardware(); + if (refcount_dec_and_mutex_lock(&num_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -751,9 +698,6 @@ static unsigned long getrate(bool freq, unsigned long sample, */ if (sample_rate_to_freq(si, rate) > sysctl_perf_event_sample_rate) { - debug_sprintf_event(sfdbg, 1, "%s: " - "Sampling rate exceeds maximum " - "perf sample rate\n", __func__); rate = 0; } } @@ -798,9 +742,6 @@ static int __hw_perf_event_init_rate(struct perf_event *event, attr->sample_period = rate; SAMPL_RATE(hwc) = rate; hw_init_period(hwc, SAMPL_RATE(hwc)); - debug_sprintf_event(sfdbg, 4, "%s: cpu %d period %#llx freq %d,%#lx\n", - __func__, event->cpu, event->attr.sample_period, - event->attr.freq, SAMPLE_FREQ_MODE(hwc)); return 0; } @@ -810,23 +751,17 @@ static int __hw_perf_event_init(struct perf_event *event) struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - int cpu, err; + int cpu, err = 0; /* Reserve CPU-measurement sampling facility */ - err = 0; - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); + mutex_lock(&pmc_reserve_mutex); + if (!refcount_inc_not_zero(&num_events)) { + reserve_pmc_hardware(); + refcount_set(&num_events, 1); } + mutex_unlock(&pmc_reserve_mutex); event->destroy = hw_perf_event_destroy; - if (err) - goto out; - /* Access per-CPU sampling information (query sampling info) */ /* * The event->cpu value can be -1 to count on every CPU, for example, @@ -838,9 +773,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ memset(&si, 0, sizeof(si)); cpuhw = NULL; - if (event->cpu == -1) + if (event->cpu == -1) { qsi(&si); - else { + } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. */ @@ -881,10 +816,6 @@ static int __hw_perf_event_init(struct perf_event *event) if (err) goto out; - /* Initialize sample data overflow accounting */ - hwc->extra_reg.reg = REG_OVERFLOW; - OVERFLOW_REG(hwc) = 0; - /* Use AUX buffer. No need to allocate it by ourself */ if (attr->config == PERF_EVENT_CPUM_SF_DIAG) return 0; @@ -1007,7 +938,7 @@ static void cpumsf_pmu_enable(struct pmu *pmu) extend_sampling_buffer(&cpuhw->sfb, hwc); } /* Rate may be adjusted with ioctl() */ - cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); + cpuhw->lsctl.interval = SAMPL_RATE(hwc); } /* (Re)enable the PMU and sampling facility */ @@ -1023,12 +954,6 @@ static void cpumsf_pmu_enable(struct pmu *pmu) /* Load current program parameter */ lpp(&get_lowcore()->lpp); - - debug_sprintf_event(sfdbg, 6, "%s: es %i cs %i ed %i cd %i " - "interval %#lx tear %#lx dear %#lx\n", __func__, - cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, - cpuhw->lsctl.cd, cpuhw->lsctl.interval, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); } static void cpumsf_pmu_disable(struct pmu *pmu) @@ -1055,21 +980,18 @@ static void cpumsf_pmu_disable(struct pmu *pmu) return; } - /* Save state of TEAR and DEAR register contents */ - err = qsi(&si); - if (!err) { - /* TEAR/DEAR values are valid only if the sampling facility is - * enabled. Note that cpumsf_pmu_disable() might be called even - * for a disabled sampling facility because cpumsf_pmu_enable() - * controls the enable/disable state. - */ - if (si.es) { - cpuhw->lsctl.tear = si.tear; - cpuhw->lsctl.dear = si.dear; - } - } else - debug_sprintf_event(sfdbg, 3, "%s: qsi() failed with err %i\n", - __func__, err); + /* + * Save state of TEAR and DEAR register contents. + * TEAR/DEAR values are valid only if the sampling facility is + * enabled. Note that cpumsf_pmu_disable() might be called even + * for a disabled sampling facility because cpumsf_pmu_enable() + * controls the enable/disable state. + */ + qsi(&si); + if (si.es) { + cpuhw->lsctl.tear = si.tear; + cpuhw->lsctl.dear = si.dear; + } cpuhw->flags &= ~PMU_F_ENABLED; } @@ -1235,11 +1157,6 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, /* Count discarded samples */ *overflow += 1; } else { - debug_sprintf_event(sfdbg, 4, - "%s: Found unknown" - " sampling data entry: te->f %i" - " basic.def %#4x (%p)\n", __func__, - te->header.f, sample->def, sample); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1284,7 +1201,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * AUX buffer is used when in diagnostic sampling mode. * No perf events/samples are created. */ - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(hwc)) return; sdbt = (unsigned long *)TEAR_REG(hwc); @@ -1309,13 +1226,6 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) */ sampl_overflow += te->header.overflow; - /* Timestamps are valid for full sample-data-blocks only */ - debug_sprintf_event(sfdbg, 6, "%s: sdbt %#lx/%#lx " - "overflow %llu timestamp %#llx\n", - __func__, sdb, (unsigned long)sdbt, - te->header.overflow, - (te->header.f) ? trailer_timestamp(te) : 0ULL); - /* Collect all samples from a single sample-data-block and * flag if an (perf) event overflow happened. If so, the PMU * is stopped and remaining samples will be discarded. @@ -1340,7 +1250,7 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sdbt = get_next_sdbt(sdbt); /* Update event hardware registers */ - TEAR_REG(hwc) = (unsigned long) sdbt; + TEAR_REG(hwc) = (unsigned long)sdbt; /* Stop processing sample-data if all samples of the current * sample-data-block were flushed even if it was not full. @@ -1362,19 +1272,8 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) * are dropped. * Slightly increase the interval to avoid hitting this limit. */ - if (event_overflow) { + if (event_overflow) SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); - debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", - __func__, - DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); - } - - if (sampl_overflow || event_overflow) - debug_sprintf_event(sfdbg, 4, "%s: " - "overflows: sample %llu event %llu" - " total %llu num_sdb %llu\n", - __func__, sampl_overflow, event_overflow, - OVERFLOW_REG(hwc), num_sdb); } static inline unsigned long aux_sdb_index(struct aux_buffer *aux, @@ -1442,9 +1341,6 @@ static void aux_output_end(struct perf_output_handle *handle) /* Remove alert indicators in the buffer */ te = aux_sdb_trailer(aux, aux->alert_mark); te->header.a = 0; - - debug_sprintf_event(sfdbg, 6, "%s: SDBs %ld range %ld head %ld\n", - __func__, i, range_scan, aux->head); } /* @@ -1463,7 +1359,7 @@ static int aux_output_begin(struct perf_output_handle *handle, unsigned long range, i, range_scan, idx, head, base, offset; struct hws_trailer_entry *te; - if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + if (handle->head & ~PAGE_MASK) return -EINVAL; aux->head = handle->head >> PAGE_SHIFT; @@ -1475,10 +1371,6 @@ static int aux_output_begin(struct perf_output_handle *handle, * SDBs between aux->head and aux->empty_mark are already ready * for new data. range_scan is num of SDBs not within them. */ - debug_sprintf_event(sfdbg, 6, - "%s: range %ld head %ld alert %ld empty %ld\n", - __func__, range, aux->head, aux->alert_mark, - aux->empty_mark); if (range > aux_sdb_num_empty(aux)) { range_scan = range - aux_sdb_num_empty(aux); idx = aux->empty_mark + 1; @@ -1504,12 +1396,6 @@ static int aux_output_begin(struct perf_output_handle *handle, cpuhw->lsctl.tear = virt_to_phys((void *)base) + offset * sizeof(unsigned long); cpuhw->lsctl.dear = virt_to_phys((void *)aux->sdb_index[head]); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld empty %ld " - "index %ld tear %#lx dear %#lx\n", __func__, - aux->head, aux->alert_mark, aux->empty_mark, - head / CPUM_SF_SDB_PER_TABLE, - cpuhw->lsctl.tear, cpuhw->lsctl.dear); - return 0; } @@ -1571,14 +1457,11 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, unsigned long long *overflow) { - unsigned long i, range_scan, idx, idx_old; union hws_trailer_header old, prev, new; + unsigned long i, range_scan, idx; unsigned long long orig_overflow; struct hws_trailer_entry *te; - debug_sprintf_event(sfdbg, 6, "%s: range %ld head %ld alert %ld " - "empty %ld\n", __func__, range, aux->head, - aux->alert_mark, aux->empty_mark); if (range <= aux_sdb_num_empty(aux)) /* * No need to scan. All SDBs in range are marked as empty. @@ -1601,7 +1484,7 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, * indicator fall into this range, set it. */ range_scan = range - aux_sdb_num_empty(aux); - idx_old = idx = aux->empty_mark + 1; + idx = aux->empty_mark + 1; for (i = 0; i < range_scan; i++, idx++) { te = aux_sdb_trailer(aux, idx); prev.val = READ_ONCE_ALIGNED_128(te->header.val); @@ -1623,9 +1506,6 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, /* Update empty_mark to new position */ aux->empty_mark = aux->head + range - 1; - debug_sprintf_event(sfdbg, 6, "%s: range_scan %ld idx %ld..%ld " - "empty %ld\n", __func__, range_scan, idx_old, - idx - 1, aux->empty_mark); return true; } @@ -1642,12 +1522,12 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) unsigned long num_sdb; aux = perf_get_aux(handle); - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Inform user space new data arrived */ size = aux_sdb_num_alert(aux) << PAGE_SHIFT; - debug_sprintf_event(sfdbg, 6, "%s: #alert %ld\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s #alert %ld\n", __func__, size >> PAGE_SHIFT); perf_aux_output_end(handle, size); @@ -1661,7 +1541,7 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) num_sdb); break; } - if (WARN_ON_ONCE(!aux)) + if (!aux) return; /* Update head and alert_mark to new position */ @@ -1681,23 +1561,11 @@ static void hw_collect_aux(struct cpu_hw_sf *cpuhw) perf_aux_output_end(&cpuhw->handle, size); pr_err("Sample data caused the AUX buffer with %lu " "pages to overflow\n", aux->sfb.num_sdb); - debug_sprintf_event(sfdbg, 1, "%s: head %ld range %ld " - "overflow %lld\n", __func__, - aux->head, range, overflow); } else { size = aux_sdb_num_alert(aux) << PAGE_SHIFT; perf_aux_output_end(&cpuhw->handle, size); - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "already full, try another\n", - __func__, - aux->head, aux->alert_mark); } } - - if (done) - debug_sprintf_event(sfdbg, 6, "%s: head %ld alert %ld " - "empty %ld\n", __func__, aux->head, - aux->alert_mark, aux->empty_mark); } /* @@ -1719,8 +1587,6 @@ static void aux_buffer_free(void *data) kfree(aux->sdbt_index); kfree(aux->sdb_index); kfree(aux); - - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu\n", __func__, num_sdbt); } static void aux_sdb_init(unsigned long sdb) @@ -1828,9 +1694,6 @@ static void *aux_buffer_setup(struct perf_event *event, void **pages, */ aux->empty_mark = sfb->num_sdb - 1; - debug_sprintf_event(sfdbg, 4, "%s: SDBTs %lu SDBs %lu\n", __func__, - sfb->num_sdbt, sfb->num_sdb); - return aux; no_sdbt: @@ -1863,8 +1726,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) memset(&si, 0, sizeof(si)); if (event->cpu == -1) { - if (qsi(&si)) - return -ENODEV; + qsi(&si); } else { /* Event is pinned to a particular CPU, retrieve the per-CPU * sampling structure for accessing the CPU-specific QSI. @@ -1874,7 +1736,7 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) si = cpuhw->qsi; } - do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + do_freq = !!SAMPL_FREQ_MODE(&event->hw); rate = getrate(do_freq, value, &si); if (!rate) return -EINVAL; @@ -1882,10 +1744,6 @@ static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) event->attr.sample_period = rate; SAMPL_RATE(&event->hw) = rate; hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); - debug_sprintf_event(sfdbg, 4, "%s:" - " cpu %d value %#llx period %#llx freq %d\n", - __func__, event->cpu, value, - event->attr.sample_period, do_freq); return 0; } @@ -1896,12 +1754,8 @@ static void cpumsf_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + if (!(event->hw.state & PERF_HES_STOPPED)) return; - - if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - perf_pmu_disable(event->pmu); event->hw.state = 0; cpuhw->lsctl.cs = 1; @@ -1936,7 +1790,7 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); struct aux_buffer *aux; - int err; + int err = 0; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; @@ -1944,7 +1798,6 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) return -EINVAL; - err = 0; perf_pmu_disable(event->pmu); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; @@ -2115,7 +1968,7 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Report measurement alerts only for non-PRA codes */ if (alert != CPU_MF_INT_SF_PRA) - debug_sprintf_event(sfdbg, 6, "%s: alert %#x\n", __func__, + debug_sprintf_event(sfdbg, 6, "%s alert %#x\n", __func__, alert); /* Sampling authorization change request */ @@ -2143,7 +1996,7 @@ static int cpusf_pmu_setup(unsigned int cpu, int flags) /* Ignore the notification if no events are scheduled on the PMU. * This might be racy... */ - if (!atomic_read(&num_events)) + if (!refcount_read(&num_events)) return 0; local_irq_disable(); @@ -2205,10 +2058,12 @@ static const struct kernel_param_ops param_ops_sfb_size = { .get = param_get_sfb_size, }; -#define RS_INIT_FAILURE_QSI 0x0001 -#define RS_INIT_FAILURE_BSDES 0x0002 -#define RS_INIT_FAILURE_ALRT 0x0003 -#define RS_INIT_FAILURE_PERF 0x0004 +enum { + RS_INIT_FAILURE_BSDES = 2, /* Bad basic sampling size */ + RS_INIT_FAILURE_ALRT = 3, /* IRQ registration failure */ + RS_INIT_FAILURE_PERF = 4 /* PMU registration failure */ +}; + static void __init pr_cpumsf_err(unsigned int reason) { pr_err("Sampling facility support for perf is not available: " @@ -2224,11 +2079,7 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; memset(&si, 0, sizeof(si)); - if (qsi(&si)) { - pr_cpumsf_err(RS_INIT_FAILURE_QSI); - return -ENODEV; - } - + qsi(&si); if (!si.as && !si.ad) return -ENODEV; diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 2f5a20e300f6..fa7325454266 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -738,6 +738,22 @@ static const char * const paicrypt_ctrnames[] = { [154] = "PCKMO_ENCRYPT_ECC_ED448_KEY", [155] = "IBM_RESERVED_155", [156] = "IBM_RESERVED_156", + [157] = "KM_FULL_XTS_AES_128", + [158] = "KM_FULL_XTS_AES_256", + [159] = "KM_FULL_XTS_ENCRYPTED_AES_128", + [160] = "KM_FULL_XTS_ENCRYPTED_AES_256", + [161] = "KMAC_HMAC_SHA_224", + [162] = "KMAC_HMAC_SHA_256", + [163] = "KMAC_HMAC_SHA_384", + [164] = "KMAC_HMAC_SHA_512", + [165] = "KMAC_HMAC_ENCRYPTED_SHA_224", + [166] = "KMAC_HMAC_ENCRYPTED_SHA_256", + [167] = "KMAC_HMAC_ENCRYPTED_SHA_384", + [168] = "KMAC_HMAC_ENCRYPTED_SHA_512", + [169] = "PCKMO_ENCRYPT_HMAC_512_KEY", + [170] = "PCKMO_ENCRYPT_HMAC_1024_KEY", + [171] = "PCKMO_ENCRYPT_AES_XTS_128", + [172] = "PCKMO_ENCRYPT_AES_XTS_256", }; static void __init attr_event_free(struct attribute **attrs, int num) diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 6295531b39a2..7f462bef1fc0 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -635,6 +635,15 @@ static const char * const paiext_ctrnames[] = { [25] = "NNPA_1MFRAME", [26] = "NNPA_2GFRAME", [27] = "NNPA_ACCESSEXCEPT", + [28] = "NNPA_TRANSFORM", + [29] = "NNPA_GELU", + [30] = "NNPA_MOMENTS", + [31] = "NNPA_LAYERNORM", + [32] = "NNPA_MATMUL_OP_BCAST1", + [33] = "NNPA_SQRT", + [34] = "NNPA_INVSQRT", + [35] = "NNPA_NORM", + [36] = "NNPA_REDUCE", }; static void __init attr_event_free(struct attribute **attrs, int num) diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6c2cb345402f..e48013cd832c 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -30,9 +30,9 @@ #include <linux/compat.h> #include <asm/ucontext.h> #include <linux/uaccess.h> +#include <asm/vdso-symbols.h> #include <asm/access-regs.h> #include <asm/lowcore.h> -#include <asm/vdso.h> #include "entry.h" /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index fbba37ec53cf..4df56fdb2488 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -671,6 +671,25 @@ int smp_cpu_get_polarization(int cpu) return per_cpu(pcpu_devices, cpu).polarization; } +void smp_cpu_set_capacity(int cpu, unsigned long val) +{ + per_cpu(pcpu_devices, cpu).capacity = val; +} + +unsigned long smp_cpu_get_capacity(int cpu) +{ + return per_cpu(pcpu_devices, cpu).capacity; +} + +void smp_set_core_capacity(int cpu, unsigned long val) +{ + int i; + + cpu = smp_get_base_cpu(cpu); + for (i = cpu; (i <= cpu + smp_cpu_mtid) && (i < nr_cpu_ids); i++) + smp_cpu_set_capacity(i, val); +} + int smp_cpu_get_cpu_address(int cpu) { return per_cpu(pcpu_devices, cpu).address; @@ -719,6 +738,7 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, else pcpu->state = CPU_STATE_STANDBY; smp_cpu_set_polarization(cpu, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); set_cpu_present(cpu, true); if (!early && arch_register_cpu(cpu)) set_cpu_present(cpu, false); @@ -961,6 +981,7 @@ void __init smp_prepare_boot_cpu(void) ipl_pcpu->state = CPU_STATE_CONFIGURED; lc->pcpu = (unsigned long)ipl_pcpu; smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN); + smp_cpu_set_capacity(0, CPU_CAPACITY_HIGH); } void __init smp_setup_processor_id(void) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 640363b2a105..9f59837d159e 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -162,22 +162,3 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, { arch_stack_walk_user_common(consume_entry, cookie, NULL, regs, false); } - -unsigned long return_address(unsigned int n) -{ - struct unwind_state state; - unsigned long addr; - - /* Increment to skip current stack entry */ - n++; - - unwind_for_each_frame(&state, NULL, NULL, 0) { - addr = unwind_get_return_address(&state); - if (!addr) - break; - if (!n--) - return addr; - } - return 0; -} -EXPORT_SYMBOL_GPL(return_address); diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index 2be30a96696a..88055f58fbda 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -498,7 +498,6 @@ static const struct file_operations stsi_##fc##_##s1##_##s2##_fs_ops = { \ .open = stsi_open_##fc##_##s1##_##s2, \ .release = stsi_release, \ .read = stsi_read, \ - .llseek = no_llseek, \ }; static int stsi_release(struct inode *inode, struct file *file) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 22029ecae1c5..813e5da9a973 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -24,6 +24,7 @@ #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/node.h> +#include <asm/hiperdispatch.h> #include <asm/sysinfo.h> #define PTF_HORIZONTAL (0UL) @@ -47,6 +48,7 @@ static int topology_mode = TOPOLOGY_MODE_UNINITIALIZED; static void set_topology_timer(void); static void topology_work_fn(struct work_struct *work); static struct sysinfo_15_1_x *tl_info; +static int cpu_management; static DECLARE_WORK(topology_work, topology_work_fn); @@ -144,6 +146,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(cpu, &book->mask); cpumask_set_cpu(cpu, &socket->mask); smp_cpu_set_polarization(cpu, tl_core->pp); + smp_cpu_set_capacity(cpu, CPU_CAPACITY_HIGH); } } } @@ -270,6 +273,7 @@ void update_cpu_masks(void) topo->drawer_id = id; } } + hd_reset_state(); for_each_online_cpu(cpu) { topo = &cpu_topology[cpu]; pkg_first = cpumask_first(&topo->core_mask); @@ -278,8 +282,10 @@ void update_cpu_masks(void) for_each_cpu(sibling, &topo->core_mask) { topo_sibling = &cpu_topology[sibling]; smt_first = cpumask_first(&topo_sibling->thread_mask); - if (sibling == smt_first) + if (sibling == smt_first) { topo_package->booted_cores++; + hd_add_core(sibling); + } } } else { topo->booted_cores = topo_package->booted_cores; @@ -303,8 +309,10 @@ static void __arch_update_dedicated_flag(void *arg) static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - int rc = 0; + int rc, hd_status; + hd_status = 0; + rc = 0; mutex_lock(&smp_cpu_state_mutex); if (MACHINE_HAS_TOPOLOGY) { rc = 1; @@ -314,7 +322,11 @@ static int __arch_update_cpu_topology(void) update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + if (cpu_management == 1) + hd_status = hd_enable_hiperdispatch(); mutex_unlock(&smp_cpu_state_mutex); + if (hd_status == 0) + hd_disable_hiperdispatch(); return rc; } @@ -374,7 +386,24 @@ void topology_expect_change(void) set_topology_timer(); } -static int cpu_management; +static int set_polarization(int polarization) +{ + int rc = 0; + + cpus_read_lock(); + mutex_lock(&smp_cpu_state_mutex); + if (cpu_management == polarization) + goto out; + rc = topology_set_cpu_management(polarization); + if (rc) + goto out; + cpu_management = polarization; + topology_expect_change(); +out: + mutex_unlock(&smp_cpu_state_mutex); + cpus_read_unlock(); + return rc; +} static ssize_t dispatching_show(struct device *dev, struct device_attribute *attr, @@ -400,19 +429,7 @@ static ssize_t dispatching_store(struct device *dev, return -EINVAL; if (val != 0 && val != 1) return -EINVAL; - rc = 0; - cpus_read_lock(); - mutex_lock(&smp_cpu_state_mutex); - if (cpu_management == val) - goto out; - rc = topology_set_cpu_management(val); - if (rc) - goto out; - cpu_management = val; - topology_expect_change(); -out: - mutex_unlock(&smp_cpu_state_mutex); - cpus_read_unlock(); + rc = set_polarization(val); return rc ? rc : count; } static DEVICE_ATTR_RW(dispatching); @@ -624,12 +641,37 @@ static int topology_ctl_handler(const struct ctl_table *ctl, int write, return rc; } +static int polarization_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int polarization; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &polarization, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + polarization = cpu_management; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + return set_polarization(polarization); +} + static struct ctl_table topology_ctl_table[] = { { .procname = "topology", .mode = 0644, .proc_handler = topology_ctl_handler, }, + { + .procname = "polarization", + .mode = 0644, + .proc_handler = polarization_ctl_handler, + }, }; static int __init topology_init(void) @@ -642,6 +684,8 @@ static int __init topology_init(void) set_topology_timer(); else topology_update_polarization_simple(); + if (IS_ENABLED(CONFIG_SCHED_TOPOLOGY_VERTICAL)) + set_polarization(1); register_sysctl("s390", topology_ctl_table); dev_root = bus_get_dev_root(&cpu_subsys); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 36db065c7cf7..9646f773208a 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -14,6 +14,7 @@ #include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/pagewalk.h> #include <asm/facility.h> #include <asm/sections.h> #include <asm/uv.h> @@ -462,9 +463,9 @@ EXPORT_SYMBOL_GPL(gmap_convert_to_secure); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) { struct vm_area_struct *vma; + struct folio_walk fw; unsigned long uaddr; struct folio *folio; - struct page *page; int rc; rc = -EFAULT; @@ -483,11 +484,15 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) goto out; rc = 0; - /* we take an extra reference here */ - page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); - if (IS_ERR_OR_NULL(page)) + folio = folio_walk_start(&fw, vma, uaddr, 0); + if (!folio) goto out; - folio = page_folio(page); + /* + * See gmap_make_secure(): large folios cannot be secure. Small + * folio implies FW_LEVEL_PTE. + */ + if (folio_test_large(folio) || !pte_write(fw.pte)) + goto out_walk_end; rc = uv_destroy_folio(folio); /* * Fault handlers can race; it is possible that two CPUs will fault @@ -500,7 +505,8 @@ int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) */ if (rc) rc = uv_convert_from_secure_folio(folio); - folio_put(folio); +out_walk_end: + folio_walk_end(&fw, vma); out: mmap_read_unlock(gmap->mm); return rc; @@ -548,11 +554,6 @@ int arch_make_folio_accessible(struct folio *folio) } EXPORT_SYMBOL_GPL(arch_make_folio_accessible); -int arch_make_page_accessible(struct page *page) -{ - return arch_make_folio_accessible(page_folio(page)); -} -EXPORT_SYMBOL_GPL(arch_make_page_accessible); static ssize_t uv_query_facilities(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 2f967ac2b8e3..598b512cde01 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -12,12 +12,15 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/time_namespace.h> #include <linux/random.h> #include <vdso/datapage.h> +#include <asm/vdso/vsyscall.h> +#include <asm/alternative.h> #include <asm/vdso.h> extern char vdso64_start[], vdso64_end[]; @@ -29,12 +32,6 @@ static union vdso_data_store vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -enum vvar_pages { - VVAR_DATA_PAGE_OFFSET, - VVAR_TIMENS_PAGE_OFFSET, - VVAR_NR_PAGES, -}; - #ifdef CONFIG_TIME_NS struct vdso_data *arch_get_vdso_data(void *vvar_page) { @@ -250,8 +247,25 @@ static struct page ** __init vdso_setup_pages(void *start, void *end) return pagelist; } +static void vdso_apply_alternatives(void) +{ + const struct elf64_shdr *alt, *shdr; + struct alt_instr *start, *end; + const struct elf64_hdr *hdr; + + hdr = (struct elf64_hdr *)vdso64_start; + shdr = (void *)hdr + hdr->e_shoff; + alt = find_section(hdr, shdr, ".altinstructions"); + if (!alt) + return; + start = (void *)hdr + alt->sh_offset; + end = (void *)hdr + alt->sh_offset + alt->sh_size; + apply_alternatives(start, end); +} + static int __init vdso_init(void) { + vdso_apply_alternatives(); vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); if (IS_ENABLED(CONFIG_COMPAT)) vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index ba19c0ca7c87..37bb4b761229 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -3,12 +3,17 @@ # Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile -obj-vdso64 = vdso_user_wrapper.o note.o -obj-cvdso64 = vdso64_generic.o getcpu.o +obj-vdso64 = vdso_user_wrapper.o note.o vgetrandom-chacha.o +obj-cvdso64 = vdso64_generic.o getcpu.o vgetrandom.o VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vgetrandom.o = $(VDSO_CFLAGS_REMOVE) CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) +ifneq ($(c-getrandom-y),) + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) +endif + # Build rules targets := $(obj-vdso64) $(obj-cvdso64) vdso64.so vdso64.so.dbg diff --git a/arch/s390/kernel/vdso64/vdso.h b/arch/s390/kernel/vdso64/vdso.h index 34c7a2312f9d..9e5397e7b590 100644 --- a/arch/s390/kernel/vdso64/vdso.h +++ b/arch/s390/kernel/vdso64/vdso.h @@ -10,5 +10,6 @@ int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unuse int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts); +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len); #endif /* __ARCH_S390_KERNEL_VDSO64_VDSO_H */ diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 37e2a505e81d..753040a4b5ab 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -4,6 +4,7 @@ * library */ +#include <asm/vdso/vsyscall.h> #include <asm/page.h> #include <asm/vdso.h> @@ -13,6 +14,7 @@ OUTPUT_ARCH(s390:64-bit) SECTIONS { PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); + PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif @@ -42,6 +44,10 @@ SECTIONS .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } .rodata1 : { *(.rodata1) } + . = ALIGN(8); + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + .dynamic : { *(.dynamic) } :text :dynamic .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr @@ -140,6 +146,7 @@ VERSION __kernel_restart_syscall; __kernel_rt_sigreturn; __kernel_sigreturn; + __kernel_getrandom; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index e26e68675c08..aa06c85bcbd3 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -13,10 +13,7 @@ * for details. */ .macro vdso_func func - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC aghi %r15,-STACK_FRAME_VDSO_OVERHEAD CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD) @@ -32,7 +29,7 @@ __kernel_\func: CFI_RESTORE 15 br %r14 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_func gettimeofday @@ -41,16 +38,13 @@ vdso_func clock_gettime vdso_func getcpu .macro vdso_syscall func,syscall - .globl __kernel_\func - .type __kernel_\func,@function - __ALIGN -__kernel_\func: +SYM_FUNC_START(__kernel_\func) CFI_STARTPROC svc \syscall /* Make sure we notice when a syscall returns, which shouldn't happen */ .word 0 CFI_ENDPROC - .size __kernel_\func,.-__kernel_\func +SYM_FUNC_END(__kernel_\func) .endm vdso_syscall restart_syscall,__NR_restart_syscall diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S new file mode 100644 index 000000000000..09c034c2f853 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/stringify.h> +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/dwarf.h> +#include <asm/fpu-insn.h> + +#define STATE0 %v0 +#define STATE1 %v1 +#define STATE2 %v2 +#define STATE3 %v3 +#define COPY0 %v4 +#define COPY1 %v5 +#define COPY2 %v6 +#define COPY3 %v7 +#define BEPERM %v19 +#define TMP0 %v20 +#define TMP1 %v21 +#define TMP2 %v22 +#define TMP3 %v23 + + .section .rodata + + .balign 32 +SYM_DATA_START_LOCAL(chacha20_constants) + .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral + .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap +SYM_DATA_END(chacha20_constants) + + .text +/* + * s390 ChaCha20 implementation meant for vDSO. Produces a given positive + * number of blocks of output with nonce 0, taking an input key and 8-bytes + * counter. Does not spill to the stack. + * + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, + * const uint8_t *key, + * uint32_t *counter, + * size_t nblocks) + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + CFI_STARTPROC + larl %r1,chacha20_constants + + /* COPY0 = "expand 32-byte k" */ + VL COPY0,0,,%r1 + + /* BEPERM = byte selectors for VPERM */ + ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148) + + /* COPY1,COPY2 = key */ + VLM COPY1,COPY2,0,%r3 + + /* COPY3 = counter || zero nonce */ + lg %r3,0(%r4) + VZERO COPY3 + VLVGG COPY3,%r3,0 + + lghi %r1,0 +.Lblock: + VLR STATE0,COPY0 + VLR STATE1,COPY1 + VLR STATE2,COPY2 + VLR STATE3,COPY3 + + lghi %r0,10 +.Ldoubleround: + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[1,2,3,0] */ + VSLDB STATE1,STATE1,STATE1,4 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[3,0,1,2] */ + VSLDB STATE3,STATE3,STATE3,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,16 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 12) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,12 + + /* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 8) */ + VAF STATE0,STATE0,STATE1 + VX STATE3,STATE3,STATE0 + VERLLF STATE3,STATE3,8 + + /* STATE2 += STATE3, STATE1 = rotl32(STATE1 ^ STATE2, 7) */ + VAF STATE2,STATE2,STATE3 + VX STATE1,STATE1,STATE2 + VERLLF STATE1,STATE1,7 + + /* STATE1[0,1,2,3] = STATE1[3,0,1,2] */ + VSLDB STATE1,STATE1,STATE1,12 + /* STATE2[0,1,2,3] = STATE2[2,3,0,1] */ + VSLDB STATE2,STATE2,STATE2,8 + /* STATE3[0,1,2,3] = STATE3[1,2,3,0] */ + VSLDB STATE3,STATE3,STATE3,4 + brctg %r0,.Ldoubleround + + /* OUTPUT0 = STATE0 + COPY0 */ + VAF STATE0,STATE0,COPY0 + /* OUTPUT1 = STATE1 + COPY1 */ + VAF STATE1,STATE1,COPY1 + /* OUTPUT2 = STATE2 + COPY2 */ + VAF STATE2,STATE2,COPY2 + /* OUTPUT3 = STATE3 + COPY3 */ + VAF STATE3,STATE3,COPY3 + + ALTERNATIVE \ + __stringify( \ + /* Convert STATE to little endian and store to OUTPUT */\ + VPERM TMP0,STATE0,STATE0,BEPERM; \ + VPERM TMP1,STATE1,STATE1,BEPERM; \ + VPERM TMP2,STATE2,STATE2,BEPERM; \ + VPERM TMP3,STATE3,STATE3,BEPERM; \ + VSTM TMP0,TMP3,0,%r2), \ + __stringify( \ + /* 32 bit wise little endian store to OUTPUT */ \ + VSTBRF STATE0,0,,%r2; \ + VSTBRF STATE1,16,,%r2; \ + VSTBRF STATE2,32,,%r2; \ + VSTBRF STATE3,48,,%r2; \ + brcl 0,0), \ + ALT_FACILITY(148) + + /* ++COPY3.COUNTER */ + /* alsih %r3,1 */ + .insn rilu,0xcc0a00000000,%r3,1 + alcr %r3,%r1 + VLVGG COPY3,%r3,0 + + /* OUTPUT += 64, --NBLOCKS */ + aghi %r2,64 + brctg %r5,.Lblock + + /* COUNTER = COPY3.COUNTER */ + stg %r3,0(%r4) + + /* Zero out potentially sensitive regs */ + VZERO STATE0 + VZERO STATE1 + VZERO STATE2 + VZERO STATE3 + VZERO COPY1 + VZERO COPY2 + + /* Early exit if TMP0-TMP3 have not been used */ + ALTERNATIVE "nopr", "br %r14", ALT_FACILITY(148) + + VZERO TMP0 + VZERO TMP1 + VZERO TMP2 + VZERO TMP3 + + br %r14 + CFI_ENDPROC +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/s390/kernel/vdso64/vgetrandom.c b/arch/s390/kernel/vdso64/vgetrandom.c new file mode 100644 index 000000000000..b5268b507fb5 --- /dev/null +++ b/arch/s390/kernel/vdso64/vgetrandom.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/facility.h> +#include <uapi/asm-generic/errno.h> +#include "vdso.h" + +ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + if (test_facility(129)) + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) + return -ENOSYS; + return getrandom_syscall(buffer, len, flags); +} diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index ae5d0a9d6911..377b9aaf8c92 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -191,8 +191,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - RUNTIME_CONST(shift, d_hash_shift) - RUNTIME_CONST(ptr, dentry_hashtable) + RUNTIME_CONST_VARIABLES PERCPU_SECTION(0x100) diff --git a/arch/s390/kernel/wti.c b/arch/s390/kernel/wti.c new file mode 100644 index 000000000000..949fdbf0e8b6 --- /dev/null +++ b/arch/s390/kernel/wti.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for warning track interruption + * + * Copyright IBM Corp. 2023 + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/kallsyms.h> +#include <linux/smpboot.h> +#include <linux/irq.h> +#include <uapi/linux/sched/types.h> +#include <asm/debug.h> +#include <asm/diag.h> +#include <asm/sclp.h> + +#define WTI_DBF_LEN 64 + +struct wti_debug { + unsigned long missed; + unsigned long addr; + pid_t pid; +}; + +struct wti_state { + /* debug data for s390dbf */ + struct wti_debug dbg; + /* + * Represents the real-time thread responsible to + * acknowledge the warning-track interrupt and trigger + * preliminary and postliminary precautions. + */ + struct task_struct *thread; + /* + * If pending is true, the real-time thread must be scheduled. + * If not, a wake up of that thread will remain a noop. + */ + bool pending; +}; + +static DEFINE_PER_CPU(struct wti_state, wti_state); + +static debug_info_t *wti_dbg; + +/* + * During a warning-track grace period, interrupts are disabled + * to prevent delays of the warning-track acknowledgment. + * + * Once the CPU is physically dispatched again, interrupts are + * re-enabled. + */ + +static void wti_irq_disable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* disable all I/O interrupts */ + cr6.val &= ~0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void wti_irq_enable(void) +{ + unsigned long flags; + struct ctlreg cr6; + + local_irq_save(flags); + local_ctl_store(6, &cr6); + /* enable all I/O interrupts */ + cr6.val |= 0xff000000UL; + local_ctl_load(6, &cr6); + local_irq_restore(flags); +} + +static void store_debug_data(struct wti_state *st) +{ + struct pt_regs *regs = get_irq_regs(); + + st->dbg.pid = current->pid; + st->dbg.addr = 0; + if (!user_mode(regs)) + st->dbg.addr = regs->psw.addr; +} + +static void wti_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct wti_state *st = this_cpu_ptr(&wti_state); + + inc_irq_stat(IRQEXT_WTI); + wti_irq_disable(); + store_debug_data(st); + st->pending = true; + wake_up_process(st->thread); +} + +static int wti_pending(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + return st->pending; +} + +static void wti_dbf_grace_period(struct wti_state *st) +{ + struct wti_debug *wdi = &st->dbg; + char buf[WTI_DBF_LEN]; + + if (wdi->addr) + snprintf(buf, sizeof(buf), "%d %pS", wdi->pid, (void *)wdi->addr); + else + snprintf(buf, sizeof(buf), "%d <user>", wdi->pid); + debug_text_event(wti_dbg, 2, buf); + wdi->missed++; +} + +static int wti_show(struct seq_file *seq, void *v) +{ + struct wti_state *st; + int cpu; + + cpus_read_lock(); + seq_puts(seq, " "); + for_each_online_cpu(cpu) + seq_printf(seq, "CPU%-8d", cpu); + seq_putc(seq, '\n'); + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + seq_printf(seq, " %10lu", st->dbg.missed); + } + seq_putc(seq, '\n'); + cpus_read_unlock(); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(wti); + +static void wti_thread_fn(unsigned int cpu) +{ + struct wti_state *st = per_cpu_ptr(&wti_state, cpu); + + st->pending = false; + /* + * Yield CPU voluntarily to the hypervisor. Control + * resumes when hypervisor decides to dispatch CPU + * to this LPAR again. + */ + if (diag49c(DIAG49C_SUBC_ACK)) + wti_dbf_grace_period(st); + wti_irq_enable(); +} + +static struct smp_hotplug_thread wti_threads = { + .store = &wti_state.thread, + .thread_should_run = wti_pending, + .thread_fn = wti_thread_fn, + .thread_comm = "cpuwti/%u", + .selfparking = false, +}; + +static int __init wti_init(void) +{ + struct sched_param wti_sched_param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct dentry *wti_dir; + struct wti_state *st; + int cpu, rc; + + rc = -EOPNOTSUPP; + if (!sclp.has_wti) + goto out; + rc = smpboot_register_percpu_thread(&wti_threads); + if (WARN_ON(rc)) + goto out; + for_each_online_cpu(cpu) { + st = per_cpu_ptr(&wti_state, cpu); + sched_setscheduler(st->thread, SCHED_FIFO, &wti_sched_param); + } + rc = register_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); + if (rc) { + pr_warn("Couldn't request external interrupt 0x1007\n"); + goto out_thread; + } + irq_subclass_register(IRQ_SUBCLASS_WARNING_TRACK); + rc = diag49c(DIAG49C_SUBC_REG); + if (rc) { + pr_warn("Failed to register warning track interrupt through DIAG 49C\n"); + rc = -EOPNOTSUPP; + goto out_subclass; + } + wti_dir = debugfs_create_dir("wti", arch_debugfs_dir); + debugfs_create_file("stat", 0400, wti_dir, NULL, &wti_fops); + wti_dbg = debug_register("wti", 1, 1, WTI_DBF_LEN); + if (!wti_dbg) { + rc = -ENOMEM; + goto out_debug_register; + } + rc = debug_register_view(wti_dbg, &debug_hex_ascii_view); + if (rc) + goto out_debug_register; + goto out; +out_debug_register: + debug_unregister(wti_dbg); +out_subclass: + irq_subclass_unregister(IRQ_SUBCLASS_WARNING_TRACK); + unregister_external_irq(EXT_IRQ_WARNING_TRACK, wti_interrupt); +out_thread: + smpboot_unregister_percpu_thread(&wti_threads); +out: + return rc; +} +late_initcall(wti_init); |